Loading src/Benchmarks/Convolution/CMakeLists.txt +1 −2 Original line number Diff line number Diff line Loading @@ -68,5 +68,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h") src/Benchmarks/Convolution/kernels/heatEquationSharedData.h 0 → 100644 +182 −0 Original line number Diff line number Diff line #ifdef HAVE_CUDA /** * This method stores image tile into shared memory * and then calculates convolution. * * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html */ #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Cuda/SharedMemory.h> template< int Dimension, typename Device > struct Convolution; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, Convolve convolve, Store store ) { Real* data = TNL::Cuda::getSharedMemory< Real >(); const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, ix, iy, i, j, data[ index ]); } } store( ix, iy, result ); } template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration configuration; setup< Index, Real >( configuration, dimensions, kernelSize ); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y(), fetchData, fetchBoundary, convolve, store ); }; }; #endif src/Benchmarks/Convolution/support/HeatEquationSolver.h +27 −35 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #pragma once #include "Solver.h" #include "DummyTask.h" #include "HeatEquationTask.h" #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Timer.h> Loading @@ -11,8 +11,8 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; static std::string sigmaKey = "sigma"; static std::string timestepKey = "timeStep"; static std::string finalTimeKey = "finalTime"; static std::string timeStepKey = "timeStep"; static std::string timeKey = "time"; static std::string outputFilenamePrefix = "outputFilenamePrefix"; template< typename Real = double > Loading Loading @@ -64,16 +64,29 @@ public: result.setLike( function ); result = 0; auto finalTime = parameters.getParameter< Real >( finalTimeKey ); auto timeStep = parameters.getParameter< double >( timeStepKey ); auto finalTime = parameters.getParameter< double >( timeKey ); convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime ); int iterationsCount = finalTime / timeStep; auto finalFilename = filenamePrefix + "_final.txt"; double time = timeStep; if( ! writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) { for (int i = 1; i <= iterationsCount; i++) { printf("Time: %lf\n", time); convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), time ); auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt"; if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) { std::cout << "Did fail during file write"; return; } result = 0; time += timeStep; } } virtual TNL::Config::ConfigDescription Loading @@ -82,8 +95,8 @@ public: TNL::Config::ConfigDescription config = Base::makeInputConfig(); config.addDelimiter( "Grid settings:" ); config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 ); config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 ); config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 ); config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 ); config.addDelimiter( "Kernel settings:" ); config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 ); Loading @@ -97,7 +110,8 @@ public: config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5); config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12); config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36); return config; } Loading Loading @@ -136,34 +150,12 @@ public: void convolve( const Vector& dimensions, const Point& domain, const Point& spaceSteps, const Vector& kernelSize, typename DataStore::ConstViewType input, typename DataStore::ViewType result, const Real time ) const { HostDataStore kernel; kernel.resize( kernelSize.x() * kernelSize.y() ); for( int j = 0; j < kernelSize.y(); j++ ) { for( int i = 0; i < kernelSize.x(); i++ ) { int index = i + j * kernelSize.x(); auto x = i * spaceSteps.x() - domain.x() / 2.; auto y = j * spaceSteps.y() - domain.y() / 2.; kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) ); } } std::cout << kernel << std::endl; DataStore kernelDevice( kernel ); auto kernelView = kernelDevice.getConstView(); DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0); HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. }, time, input, result); } bool Loading src/Benchmarks/Convolution/support/HeatEquationTask.h 0 → 100644 +94 −0 Original line number Diff line number Diff line #pragma once template< int Dimension, typename Device > struct Convolution { template< typename Index > using Vector = TNL::Containers::StaticVector< Dimension, Index >; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, Convolve&& convolve, Store&& store ); }; template< typename Index, typename Real, int Dimension, typename Device > struct HeatEquationTask; template< typename Index, typename Real > struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda > { public: static constexpr int Dimension = 2; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using Point = TNL::Containers::StaticVector< Dimension, Real >; using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; using ConvolutionLauncher = Convolution< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, const Point& functionDomain, const Point& kernelDomain, const Real time, ConstDataStore& input, DataStore& result) { auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y()); auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y()); auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * dimensions.x(); return input[ index ]; }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { return 0; }; auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data ) { auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2), functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2); auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(), kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y(); auto deltaXPos = kernelXPos - functionXPos, deltaYPos = kernelYPos - functionYPos; auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.) ) / ( (Real)4 * time ) ); return result + data * kernel; }; auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable { auto index = i + j * dimensions.x(); result[ index ] = resultValue; }; ConvolutionLauncher::execute< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } }; src/Benchmarks/Convolution/kernels/sharedData.h +4 −4 File changed.Contains only whitespace changes. Show changes Loading
src/Benchmarks/Convolution/CMakeLists.txt +1 −2 Original line number Diff line number Diff line Loading @@ -68,5 +68,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h")
src/Benchmarks/Convolution/kernels/heatEquationSharedData.h 0 → 100644 +182 −0 Original line number Diff line number Diff line #ifdef HAVE_CUDA /** * This method stores image tile into shared memory * and then calculates convolution. * * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html */ #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Cuda/SharedMemory.h> template< int Dimension, typename Device > struct Convolution; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, Convolve convolve, Store store ) { Real* data = TNL::Cuda::getSharedMemory< Real >(); const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, ix, iy, i, j, data[ index ]); } } store( ix, iy, result ); } template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration configuration; setup< Index, Real >( configuration, dimensions, kernelSize ); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y(), fetchData, fetchBoundary, convolve, store ); }; }; #endif
src/Benchmarks/Convolution/support/HeatEquationSolver.h +27 −35 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #pragma once #include "Solver.h" #include "DummyTask.h" #include "HeatEquationTask.h" #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Timer.h> Loading @@ -11,8 +11,8 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; static std::string sigmaKey = "sigma"; static std::string timestepKey = "timeStep"; static std::string finalTimeKey = "finalTime"; static std::string timeStepKey = "timeStep"; static std::string timeKey = "time"; static std::string outputFilenamePrefix = "outputFilenamePrefix"; template< typename Real = double > Loading Loading @@ -64,16 +64,29 @@ public: result.setLike( function ); result = 0; auto finalTime = parameters.getParameter< Real >( finalTimeKey ); auto timeStep = parameters.getParameter< double >( timeStepKey ); auto finalTime = parameters.getParameter< double >( timeKey ); convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime ); int iterationsCount = finalTime / timeStep; auto finalFilename = filenamePrefix + "_final.txt"; double time = timeStep; if( ! writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) { for (int i = 1; i <= iterationsCount; i++) { printf("Time: %lf\n", time); convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), time ); auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt"; if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) { std::cout << "Did fail during file write"; return; } result = 0; time += timeStep; } } virtual TNL::Config::ConfigDescription Loading @@ -82,8 +95,8 @@ public: TNL::Config::ConfigDescription config = Base::makeInputConfig(); config.addDelimiter( "Grid settings:" ); config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 ); config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 ); config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 ); config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 ); config.addDelimiter( "Kernel settings:" ); config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 ); Loading @@ -97,7 +110,8 @@ public: config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5); config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12); config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36); return config; } Loading Loading @@ -136,34 +150,12 @@ public: void convolve( const Vector& dimensions, const Point& domain, const Point& spaceSteps, const Vector& kernelSize, typename DataStore::ConstViewType input, typename DataStore::ViewType result, const Real time ) const { HostDataStore kernel; kernel.resize( kernelSize.x() * kernelSize.y() ); for( int j = 0; j < kernelSize.y(); j++ ) { for( int i = 0; i < kernelSize.x(); i++ ) { int index = i + j * kernelSize.x(); auto x = i * spaceSteps.x() - domain.x() / 2.; auto y = j * spaceSteps.y() - domain.y() / 2.; kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) ); } } std::cout << kernel << std::endl; DataStore kernelDevice( kernel ); auto kernelView = kernelDevice.getConstView(); DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0); HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. }, time, input, result); } bool Loading
src/Benchmarks/Convolution/support/HeatEquationTask.h 0 → 100644 +94 −0 Original line number Diff line number Diff line #pragma once template< int Dimension, typename Device > struct Convolution { template< typename Index > using Vector = TNL::Containers::StaticVector< Dimension, Index >; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, Convolve&& convolve, Store&& store ); }; template< typename Index, typename Real, int Dimension, typename Device > struct HeatEquationTask; template< typename Index, typename Real > struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda > { public: static constexpr int Dimension = 2; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using Point = TNL::Containers::StaticVector< Dimension, Real >; using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; using ConvolutionLauncher = Convolution< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, const Point& functionDomain, const Point& kernelDomain, const Real time, ConstDataStore& input, DataStore& result) { auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y()); auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y()); auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * dimensions.x(); return input[ index ]; }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { return 0; }; auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data ) { auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2), functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2); auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(), kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y(); auto deltaXPos = kernelXPos - functionXPos, deltaYPos = kernelYPos - functionYPos; auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.) ) / ( (Real)4 * time ) ); return result + data * kernel; }; auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable { auto index = i + j * dimensions.x(); result[ index ] = resultValue; }; ConvolutionLauncher::execute< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } };
src/Benchmarks/Convolution/kernels/sharedData.h +4 −4 File changed.Contains only whitespace changes. Show changes