diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index 50e46776282b3a13845337c26ddb9d83c31751ae..e3b14d8518bc57ee74964fe181eff630d6ef98bc 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory( Convolution ) add_subdirectory( HeatEquation ) add_subdirectory( BLAS ) add_subdirectory( NDArray ) diff --git a/src/Benchmarks/Convolution/.gitignore b/src/Benchmarks/Convolution/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..86d4c2dd380eb2e9e2a050e882f40bc65dc2a48b --- /dev/null +++ b/src/Benchmarks/Convolution/.gitignore @@ -0,0 +1 @@ +generated diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..31d46cbf70d34245f377bd5309e190629f7a903e --- /dev/null +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -0,0 +1,77 @@ + +function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER) + +get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE) +get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE) + +if (${BUILD_CUDA}) + SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu") + + FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) + + STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") + STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") + + get_filename_component(ABSOLUTE_SUPPORT_PATH ${SOURCE_FILE} ABSOLUTE) + + if(NOT EXISTS ${ABSOLUTE_SUPPORT_PATH}) + FILE(WRITE ${ABSOLUTE_SUPPORT_PATH} "") + endif() + + FILE(READ ${SOURCE_FILE} SOURCE_FILE_CONTENT) + + if ( NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" ) + FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") + endif() + + SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") + + CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) + + if( PNG_FOUND ) + target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} ) + endif() +else() + MESSAGE(WARNING "Convolutions are not supported on CPU") +endif() + +endfunction() + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naive.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h new file mode 100644 index 0000000000000000000000000000000000000000..5705deb04be7089688bd0212d7c8c517bab56cc5 --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -0,0 +1,351 @@ + +#pragma once + +#ifdef HAVE_CUDA + + #include + #include + #include + +/** + * There are several pitfalls with such configuration. + * + * 1. At first we don't use shared memory + * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large + * kernels. + */ + +template< int Dimension, typename Device > +struct Convolution; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX ) + return; + + Index radius = kernelWidth >> 1; + + Real result = 0; + + for( Index i = -radius; i <= radius; i++ ) { + Index elementIndex = i + ix; + Index kernelIndex = i + radius; + + if( elementIndex < 0 || elementIndex >= endX ) { + result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) ); + } + else { + result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) ); + } + } + + store( ix, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY ) + return; + + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Real result = 0; + + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { + result = + convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); + } + } + } + + store( ix, iy, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Real result = 0; + + for( Index k = -radiusZ; k <= radiusZ; k++ ) { + Index elementIndexZ = k + iz; + Index kernelIndexZ = k + radiusZ; + + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 + || elementIndexZ >= endZ ) + { + result = convolve( result, + fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), + fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + } + else { + result = convolve( result, + fetchData( elementIndexX, elementIndexY, elementIndexZ ), + fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + } + } + } + } + + store( ix, iy, iz, result ); +} + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h new file mode 100644 index 0000000000000000000000000000000000000000..f1dfb90086fb3b75fda3b2f325d7d39bb6cee6a9 --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -0,0 +1,544 @@ +#pragma once + +#ifdef HAVE_CUDA + +/** + * This method stores image tile into shared memory + * and then calculates convolution. + * + * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html + */ + +#include +#include +#include +#include + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Index radius = kernelWidth >> 1; + + // Left + Index lhs = ix - radius; + + if( lhs < 0 || lhs >= endX ) { + data[ threadIdx.x ] = fetchBoundary( lhs ); + } + else { + data[ threadIdx.x ] = fetchData( lhs ); + } + + // Right + Index rhs = ix + radius; + + if( rhs < 0 || rhs >= endX ) { + data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); + } + else { + data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); + } + + __syncthreads(); + + if( ix >= endX ) + return; + + Real result = 0; + + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x; + + result = convolve( result, data[ elementIndex ], fetchKernel( i ) ); + } + + store( ix, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Real* data = TNL::Cuda::getSharedMemory< Real >(); + + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + const Index radiusY = kernelHeight >> 1; + const Index radiusX = kernelWidth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + + Index x, y, index; + + // Top Left + x = ix - radiusX; + y = iy - radiusY; + index = threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Top right + x = ix + radiusX; + y = iy - radiusY; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Left + x = ix - radiusX; + y = iy + radiusY; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if(x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Right + x = ix + radiusX; + y = iy + radiusY; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + __syncthreads(); + + if( ix >= endX || iy >= endY ) + return; + + Real result = 0; + + for( Index j = 0; j < kernelHeight; j++ ) { + Index align = ( j + threadIdx.y ) * dataBlockWidth; + + for( Index i = 0; i < kernelWidth; i++ ) { + Index index = i + threadIdx.x + align; + + result = convolve( result, data[ index ], fetchKernel( i, j ) ); + } + } + + store( ix, iy, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Real* data = TNL::Cuda::getSharedMemory< Real >(); + + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index iz = threadIdx.z + blockIdx.z * blockDim.z; + + const Index radiusX = kernelWidth >> 1; + const Index radiusY = kernelHeight >> 1; + const Index radiusZ = kernelDepth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + const Index dataBlockDepth = 2 * kernelDepth - 1; + + const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + const Index dataBlockRadiusZ = dataBlockDepth >> 1; + + Index x, y, z, index; + + // Z: 0 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + __syncthreads(); + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Real result = 0; + + for( Index k = 0; k < kernelDepth; k++ ) { + Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; + + for( Index j = 0; j < kernelHeight; j++ ) { + Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; + + for( Index i = 0; i < kernelWidth; i++ ) { + Index index = i + threadIdx.x + xAlign + xyAlign; + + result = convolve( result, data[ index ], fetchKernel( i, j, k ) ); + } + } + } + + store( ix, iy, iz, result ); +} + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1 ; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d094203b8787fade33e15be996fc2a4277c66c --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h @@ -0,0 +1,600 @@ +#pragma once + +#ifdef HAVE_CUDA + + #include + #include + #include + #include + +/** + * This method stores kernel and data in the data memory to reduce amount of loads. + * + * We can calculate the size of shared memory needed the next way: + * 1. We need to store in shared memory: + * * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth + * * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight + * * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth * + * kernelHeight * kernelDepth + * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024. + * Then the maximal amount of shared memory is: + * * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared + * memory) + * * for 2D -> 4 * 1024 -> 4096 elements + * * for 3D -> 8 * 1024 -> 8196 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory, + * because we have 64 KB of data) + * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element. + * Then the maximal amount of shared memory is: + * * for 1D -> 3 * 1024 -> can use long double, double, float + * * for 2D -> 5 * 1024 -> can use double, float + * * for 3D -> 9 * 1024 -> can use float + */ + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Index kernelOffset = 2 * kernelWidth - 1; + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index radius = kernelWidth >> 1; + + // Left + Index lhs = ix - radius; + + if( lhs < 0 || lhs >= endX ) { + data[ threadIdx.x ] = fetchBoundary( lhs ); + } + else { + data[ threadIdx.x ] = fetchData( lhs ); + } + + // Right + Index rhs = ix + radius; + + if( rhs < 0 || rhs >= endX ) { + data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); + } + else { + data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); + } + + kernel[ threadIdx.x ] = fetchKernel( threadIdx.x ); + + __syncthreads(); + + if( ix >= endX ) + return; + + Real result = 0; + + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x; + + result = convolve( result, data[ elementIndex ], kernel[ i ] ); + } + + store( ix, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + const Index radiusY = kernelHeight >> 1; + const Index radiusX = kernelWidth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + + const Index kernelOffset = dataBlockWidth * dataBlockHeight; + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index x, y, index; + + // Top Left + x = ix - radiusX; + y = iy - radiusY; + index = threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Top right + x = ix + radiusX; + y = iy - radiusY; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Left + x = ix - radiusX; + y = iy + radiusY; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if(x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Right + x = ix + radiusX; + y = iy + radiusY; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + index = threadIdx.x + threadIdx.y * blockDim.x; + + kernel[index] = fetchKernel( threadIdx.x, threadIdx.y ); + + __syncthreads(); + + if( ix >= endX || iy >= endY ) + return; + + Real result = 0; + + #pragma unroll + for( Index j = 0; j < kernelHeight; j++ ) { + Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth; + Index kernelAlign = j * blockDim.x; + + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x + elementAlign; + Index kernelIndex = i + kernelAlign; + + result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); + } + } + + store( ix, iy, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index iz = threadIdx.z + blockIdx.z * blockDim.z; + + const Index radiusX = kernelWidth >> 1; + const Index radiusY = kernelHeight >> 1; + const Index radiusZ = kernelDepth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + const Index dataBlockDepth = 2 * kernelDepth - 1; + + const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + const Index dataBlockRadiusZ = dataBlockDepth >> 1; + + const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth; + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index x, y, z, index; + + // Z: 0 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; + + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + + kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + + __syncthreads(); + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Real result = 0; + + #pragma unroll + for( Index k = 0; k < kernelDepth; k++ ) { + Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; + Index xyKernelAlign = k * blockDim.x * blockDim.y; + #pragma unroll + for( Index j = 0; j < kernelHeight; j++ ) { + Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; + Index xKernelAlign = j * blockDim.x; + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x + xAlign + xyAlign; + Index kernelIndex = i + xKernelAlign + xyKernelAlign; + + result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); + } + } + } + + store( ix, iy, iz, result ); +} + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount ) * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + Index kernelVolume = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) { + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + kernelVolume *= kernelSize[ i ]; + } + + configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + Index kernelVolume = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) { + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + kernelVolume *= kernelSize[ i ]; + } + + configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ba98efe736cde1246c1b4f84aaeefaebfa22cf19 --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -0,0 +1,394 @@ + +#pragma once + +#ifdef HAVE_CUDA + + #include + #include + #include + #include + +/** + * This method stores kernel in the shared memory to reduce amount of loads. + */ + +template< int Dimension, typename Device > +struct Convolution; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radius = kernelWidth >> 1; + + // The size of the block is equal to the kernel size + shared[ threadIdx.x ] = fetchKernel( threadIdx.x ); + + __syncthreads(); + + if( ix >= endX ) + return; + + Real result = 0; + + #pragma unroll + for( Index i = -radius; i <= radius; i++ ) { + Index elementIndex = i + ix; + Index kernelIndex = i + radius; + + if( elementIndex < 0 || elementIndex >= endX ) { + result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] ); + } + else { + result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] ); + } + } + + store( ix, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y; + + // The size of the block is equal to the kernel size + shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y ); + + __syncthreads(); + + if( ix >= endX || iy >= endY ) + return; + + Real result = 0; + + #pragma unroll + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + #pragma unroll + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ threadIndex ] ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ threadIndex ] ); + } + } + } + + store( ix, iy, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; + + // The size of the block is equal to the kernel size + shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + + __syncthreads(); + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Real result = 0; + + #pragma unroll + for( Index k = -radiusZ; k <= radiusZ; k++ ) { + Index elementIndexZ = k + iz; + Index kernelIndexZ = k + radiusZ; + + #pragma unroll + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + #pragma unroll + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 + || elementIndexZ >= endZ ) + { + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] ); + } + } + } + } + + store( ix, iy, iz, result ); +} + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..b489000d4a952ae93cc6e27e5252fd4859d47a2a --- /dev/null +++ b/src/Benchmarks/Convolution/support/Benchmark.h @@ -0,0 +1,71 @@ + +#pragma once + +#include + +#include +#include + +#include +#include +#include + +template< int Dimension, typename Device > +class Benchmark +{ +public: + using TNLBenchmark = typename TNL::Benchmarks::Benchmark<>; + + void + run( const TNL::Config::ParameterContainer& parameters ) const + { + if( ! TNL::Devices::Cuda::setup( parameters ) ) + return; + + const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); + const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); + + + const int verbose = parameters.getParameter< int >( "verbose" ); + const int loops = parameters.getParameter< int >( "loops" ); + + auto mode = std::ios::out; + + if( outputMode == "append" ) + mode |= std::ios::app; + + std::ofstream logFile( logFileName.getString(), mode ); + + TNLBenchmark benchmark( logFile, loops, verbose ); + + std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata(); + TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" ); + + start(benchmark, parameters); + } + + virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const { + TNL_ASSERT_TRUE(false, "Should be overriden"); + } + + virtual TNL::Config::ConfigDescription makeInputConfig() const { + TNL::Config::ConfigDescription config; + + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" ); + config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" ); + config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + + config.addDelimiter( "Device settings:" ); + +#ifdef HAVE_CUDA + TNL::Devices::Cuda::configSetup( config ); +#endif + return config; + } +}; diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..f8e2d4ae8113adac6849fb5ca05d80c9a1484cfd --- /dev/null +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -0,0 +1,140 @@ + +#pragma once + +#include "Benchmark.h" +#include "DummyTask.h" + +static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" }; +static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; +static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" }; +static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" }; +static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" }; +static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" }; + +template< int Dimension, typename Device > +class DummyBenchmark : public Benchmark< Dimension, Device > +{ +public: + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; + using Base = Benchmark< Dimension, Device >; + using TNLBenchmark = typename Base::TNLBenchmark; + + virtual void + start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override + { + Vector dimension; + Vector minKernelSize; + Vector maxKernelSize; + + for( int i = 0; i < Dimension; i++ ) { + dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] ); + minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); + maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); + + TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); + TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); + TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" ); + } + + int kernelStep = parameters.getParameter< int >( "kernel-step" ); + + TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); + TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); + + TNL::String id = parameters.getParameter< TNL::String >( "id" ); + + time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep ); + } + + virtual void + time( const TNL::String& id, + TNLBenchmark& benchmark, + const Vector& dimension, + const Vector& minKernelSize, + const Vector& maxKernelSize, + const int kernelStep ) const + { + Vector currentKernelSize = minKernelSize; + + do { + timeConvolution( id, benchmark, dimension, currentKernelSize ); + + currentKernelSize[ 0 ] += kernelStep; + + for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { + if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { + currentKernelSize[ i ] = minKernelSize[ i ]; + currentKernelSize[ i + 1 ] += kernelStep; + } + } + } while( currentKernelSize < maxKernelSize ); + } + + void + timeConvolution( const TNL::String& id, TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const + { + auto device = TNL::getType< Device >(); + + typename TNLBenchmark::MetadataColumns columns = { { "id", id } }; + + size_t elementsCount = 1; + size_t kernelElementsCount = 1; + + for( size_t i = 0; i < dimension.getSize(); i++ ) { + elementsCount *= dimension[ i ]; + kernelElementsCount *= kernelSize[ i ]; + + columns.push_back( { dimensionIds[ i ], TNL::convertToString( dimension[ i ] ) } ); + columns.push_back( { kernelSizeIds[ i ], TNL::convertToString( kernelSize[ i ] ) } ); + } + + benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 ); + benchmark.setMetadataColumns( columns ); + + // Setup input data + DataStore input, result, kernel; + + input.resize( elementsCount ); + result.resize( elementsCount ); + kernel.resize( kernelElementsCount ); + + input = 1; + result = 1; + kernel = 1; + + auto inputView = input.getConstView(); + auto kernelView = kernel.getConstView(); + auto resultView = result.getView(); + + auto measure = [ & ]() + { + DummyTask< int, float, Dimension, Device >::exec( dimension, kernelSize, inputView, resultView, kernelView ); + }; + + benchmark.template time< Device >( device, measure ); + } + + TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Grid dimension settings:" ); + + for( int i = 0; i < Dimension; i++ ) + config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 ); + + config.addDelimiter( "Kernel settings:" ); + + for( int i = 0; i < Dimension; i++ ) + config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 ); + + for( int i = 0; i < Dimension; i++ ) + config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 ); + + config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 ); + + return config; + } +}; diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h new file mode 100644 index 0000000000000000000000000000000000000000..2b1e60041236d6127b7a74483d3c8fb384d358e7 --- /dev/null +++ b/src/Benchmarks/Convolution/support/DummySolver.h @@ -0,0 +1,89 @@ + +#pragma once + +#include "Solver.h" +#include "DummyTask.h" + +static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; +static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" }; + +template< int Dimension, typename Device > +class DummySolver : public Solver< Dimension, Device > +{ +public: + using Base = Solver< Dimension, Device >; + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const override + { + Vector dimensions; + Vector kernelSize; + + for( int i = 0; i < Dimension; i++ ) { + dimensions[ i ] = parameters.getParameter< int >( dimensionIds[ i ] ); + kernelSize[ i ] = parameters.getParameter< int >( kernelSizeIds[ i ] ); + + TNL_ASSERT_GT( dimensions[ i ], 1, "Start dimension must be positive integer" ); + + TNL_ASSERT_GE( kernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); + TNL_ASSERT_EQ( kernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); + } + + launchConvolution( dimensions, kernelSize ); + } + + void + launchConvolution( const Vector& dimension, const Vector& kernelSize ) const + { + DataStore input, result, kernel; + + size_t elementsCount = 1; + size_t kernelElementsCount = 1; + + for( size_t i = 0; i < (size_t) dimension.getSize(); i++ ) { + elementsCount *= dimension[ i ]; + kernelElementsCount *= kernelSize[ i ]; + } + + input.resize( elementsCount ); + result.resize( elementsCount ); + kernel.resize( kernelElementsCount ); + + input = 1; + result = 1; + kernel = 1; + + auto inputView = input.getConstView(); + auto kernelView = kernel.getConstView(); + auto resultView = result.getView(); + + DummyTask::exec(dimension, kernelSize, inputView, resultView, kernelView); + + TNL::Containers::Array< float, TNL::Devices::Host, int > host(result); + + for (int i = 0; i < host.getSize(); i++) + TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel"); + + std::cout << "Everything is fine" << std::endl; + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Grid dimension settings:" ); + + for( int i = 0; i < Dimension; i++ ) + config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 ); + + config.addDelimiter( "Kernel settings:" ); + + for( int i = 0; i < Dimension; i++ ) + config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 ); + + return config; + } +}; diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h new file mode 100644 index 0000000000000000000000000000000000000000..07a58e98dc026565bea1f3e27ed95b395a7780ee --- /dev/null +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -0,0 +1,187 @@ + +#pragma once + +template< int Dimension, typename Device > +struct Convolution +{ + template< typename Index > + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ); +}; + +template< typename Index, typename Real, int Dimension, typename Device > +struct DummyTask; + +template< typename Index, typename Real > +struct DummyTask< Index, Real, 1, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 1; + using Device = TNL::Devices::Cuda; + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; + using ConvolutionLauncher = Convolution< Dimension, Device >; + + static void + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) + { + auto fetchData = [ = ] __cuda_callable__( Index i ) + { + return input[ i ]; + }; + + auto fetchBoundary = [ = ] __cuda_callable__( Index i ) + { + return boundaryValue; + }; + + auto fetchKernel = [ = ] __cuda_callable__( Index i ) + { + return kernel[ i ]; + }; + + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) + { + return result + data * kernel; + }; + + auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable + { + result[ i ] = resultValue; + }; + + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; + +template< typename Index, typename Real > +struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 2; + using Device = TNL::Devices::Cuda; + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; + using ConvolutionLauncher = Convolution< Dimension, Device >; + + static void + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) + { + auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) + { + auto index = i + j * dimensions.x(); + + return input[ index ]; + }; + + auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) + { + return boundaryValue; + }; + + auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) + { + auto index = i + j * kernelSize.x(); + + return kernel[ index ]; + }; + + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) + { + return result + data * kernel; + }; + + auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable + { + auto index = i + j * dimensions.x(); + + result[ index ] = resultValue; + }; + + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; + +template< typename Index, typename Real > +struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 3; + using Device = TNL::Devices::Cuda; + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; + using ConvolutionLauncher = Convolution< Dimension, Device >; + + static void + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) + { + auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); + + return input[ index ]; + }; + + auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + return boundaryValue; + }; + + auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y(); + + return kernel[ index ]; + }; + + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) + { + return result + data * kernel; + }; + + auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable + { + auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); + + result[ index ] = resultValue; + }; + + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h new file mode 100644 index 0000000000000000000000000000000000000000..eebe89c4bec41b08be063f0e383774b5cac4f851 --- /dev/null +++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h @@ -0,0 +1,240 @@ + +#pragma once + +#include "Solver.h" +#include "DummyTask.h" + +#include +#include + +static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" }; +static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; +static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; +static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" }; + +static std::string alphaKey = "alpha"; +static std::string betaKey = "beta"; +static std::string gammaKey = "gamma"; + +static std::string timeStepKey = "timeStep"; +static std::string startTimeKey = "startTime"; +static std::string finalTimeKey = "finalTime"; +static std::string outputFilenamePrefix = "outputFilenamePrefix"; + +template< typename Real = double > +class HeatEquationSolver : public Solver< 2, TNL::Devices::Cuda > +{ +public: + constexpr static int Dimension = 2; + using Device = TNL::Devices::Cuda; + + using Base = Solver< Dimension, Device >; + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using Point = TNL::Containers::StaticVector< Dimension, Real >; + using DataStore = TNL::Containers::Vector< Real, Device, int >; + using HostDataStore = TNL::Containers::Vector< Real, TNL::Devices::Host, int >; + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const override + { + int gridXSize = parameters.getParameter< int >( dimensionIds[ 0 ] ); + int gridYSize = parameters.getParameter< int >( dimensionIds[ 1 ] ); + + int kernelXSize = parameters.getParameter< int >( kernelSizeIds[ 0 ] ); + int kernelYSize = parameters.getParameter< int >( kernelSizeIds[ 1 ] ); + + Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); + Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); + + Real kernelXDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 0 ] ); + Real kernelYDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 1 ] ); + + Real hx = xDomainSize / (Real) gridXSize; + Real hy = yDomainSize / (Real) gridYSize; + + Point domain = { xDomainSize, yDomainSize }; + Point kernelDomain = { kernelXDomainSize, kernelYDomainSize }; + Point spaceSteps = { hx, hy }; + + Vector dimensions = { gridXSize, gridYSize }; + Vector kernelSize = { kernelXSize, kernelYSize }; + + DataStore function = prepareFunction( parameters, dimensions, domain, spaceSteps ); + + auto filenamePrefix = parameters.getParameter< TNL::String >( outputFilenamePrefix ); + auto initialFilename = filenamePrefix + "_initial.txt"; + + if( ! writeGNUPlot( initialFilename, dimensions, spaceSteps, domain, function.getConstView() ) ) { + std::cout << "Did fail during file write"; + return; + } + + DataStore result; + + result.setLike( function ); + result = 0; + + auto timeStep = parameters.getParameter< double >( timeStepKey ); + auto startTime = parameters.getParameter< double >( startTimeKey ); + auto finalTime = parameters.getParameter< double >( finalTimeKey ); + + int iteration = (startTime / timeStep) + 1; + int finalIteration = finalTime / timeStep; + + double time = iteration * timeStep; + + for (int i = iteration; i <= finalIteration; i++) { + printf("Time: %lf\n", time); + + convolve( dimensions, domain, kernelSize, kernelDomain, function.getConstView(), result.getView(), time ); + + auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt"; + + if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) { + std::cout << "Did fail during file write"; + return; + } + + result = 0; + + time += timeStep; + } + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Grid settings:" ); + config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 ); + config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 ); + + config.addDelimiter( "Kernel settings:" ); + config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 ); + config.addEntry< int >( kernelSizeIds[ 1 ], "Kernel size along y-axis.", 3 ); + + config.addDelimiter( "Problem settings:" ); + config.addEntry< TNL::String >( outputFilenamePrefix, "The prefix in name of the output file", "data" ); + + config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 ); + config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 ); + + config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 ); + config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 ); + + config.addDelimiter( "Initial condition settings ( (x^2/alpha + y^2/beta) + gamma)):" ); + config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 ); + config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 ); + config.addEntry< Real >( gammaKey, "Gamma key in initial condition", 15 ); + + config.addDelimiter( "Time settings:" ); + config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0); + config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); + config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36); + + return config; + } + + DataStore + prepareFunction( const TNL::Config::ParameterContainer& parameters, + const Vector& dimensions, + const Point& domain, + const Point& spaceSteps ) const + { + DataStore function; + + function.resize( dimensions.x() * dimensions.y() ); + + auto functionView = function.getView(); + + auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); + auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); + + auto alpha = parameters.getParameter< Real >( alphaKey ); + auto beta = parameters.getParameter< Real >( betaKey ); + auto gamma = parameters.getParameter< Real >( gammaKey ); + + auto init = [ = ] __cuda_callable__( int i, int j ) mutable + { + auto index = j * dimensions.x() + i; + + auto x = i * spaceSteps.x() - domain.x() / 2.; + auto y = j * spaceSteps.y() - domain.y() / 2.; + + functionView[ index ] = TNL::max((x * x / alpha) + (y * y / beta) + gamma, 0); + }; + + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init ); + + return function; + } + + void + convolve( const Vector& dimensions, + const Point& domain, + const Vector& kernelSize, + const Point& kernelDomain, + typename DataStore::ConstViewType input, + typename DataStore::ViewType result, + const Real time ) const + { + DataStore kernel; + kernel.resize(kernelSize.x() * kernelSize.y()); + + auto kernelView = kernel.getView(); + auto domainSpaceSteps = Point(domain.x() / dimensions.x(), domain.y() / dimensions.y()); + auto kernelSpaceSteps = Point(kernelDomain.x() / (kernelSize.x() - 1), kernelDomain.y() / (kernelSize.y() - 1)); + + auto init = [ = ] __cuda_callable__( int i, int j ) mutable { + auto index = j * kernelSize.x() + i; + + auto x = i * kernelSpaceSteps.x() - kernelDomain.x() / 2.; + auto y = j * kernelSpaceSteps.y() - kernelDomain.y() / 2.; + + // The space step is given by the function domain + // However, because the kernel is limited to 31x31 size + // The user can specify it custom kernel domain from which values are taken + kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(x, 2.) + pow(y, 2.) ) / ( (Real)4 * time ) ); + }; + + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init ); + + // std::cout << std::endl << std::endl << std::endl; + + for (int i = 0; i < kernelSize.x(); i++) { + for (int j = 0; j < kernelSize.y(); j++) { + auto index = j * kernelSize.x() + i; + + printf("%lf ", kernelView.getElement(index)); + } + + printf("\n"); + } + + auto kernelConstView = kernel.getConstView(); + + DummyTask::exec(dimensions, kernelSize, input, result, kernelConstView, 0); + } + + bool + writeGNUPlot( const std::string& filename, + const Vector& dimensions, + const Point& spaceSteps, + const Point& domain, + const typename DataStore::ConstViewType& map ) const + { + std::ofstream out( filename, std::ios::out ); + + if( ! out.is_open() ) + return false; + + for( int j = 0; j < dimensions.y(); j++ ) + for( int i = 0; i < dimensions.x(); i++ ) + out << i * spaceSteps.x() - domain.x() / 2. << " " + << j * spaceSteps.y() - domain.y() / 2. << " " + << map.getElement( j * dimensions.x() + i ) << std::endl; + + return out.good(); + } +}; diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h new file mode 100644 index 0000000000000000000000000000000000000000..069553573b0ae933e8d0e11786ba1090c873bc18 --- /dev/null +++ b/src/Benchmarks/Convolution/support/ImageSolver.h @@ -0,0 +1,245 @@ + +#pragma once + +#include "Solver.h" +#include "DummyTask.h" + +#include +#include +#include +#include +#include +#include +#include + +static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; +static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size", "kernel-z-size" }; +static std::vector< TNL::String > kernels = { "identity", "gauss3x3", "gauss5x5", + "sobelHorizontal", "sobelVertical", "edgeDetection" }; + +class ImageSolver : public Solver< 2, TNL::Devices::Cuda > +{ +public: + constexpr static int Dimension = 2; + using Device = TNL::Devices::Cuda; + + using Base = Solver< Dimension, Device >; + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; + using HostDataStore = TNL::Containers::Vector< float, TNL::Devices::Host, int >; + + using GridType = TNL::Meshes::Grid< 2, float, Device, int >; + using GridPointer = TNL::Pointers::SharedPointer< GridType >; + using MeshFunctionType = TNL::Functions::MeshFunction< GridType >; + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const override + { + GridPointer grid; + MeshFunctionType meshFunction; + TNL::Images::PNGImage< int > image; + TNL::Images::RegionOfInterest< int > roi; + + meshFunction.setMesh( grid ); + + auto output = parameters.getParameter< TNL::String >( "output" ); + + if( ! this->readImage( parameters, grid, meshFunction, image, roi ) || ! this->convolve( parameters, meshFunction ) + || ! this->write( parameters, image, meshFunction ) ) + return; + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Image settings:" ); + + config.addEntry< TNL::String >( "input", "PNG image" ); + config.addEntry< TNL::String >( "output", "PNG image" ); + config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] ); + + for( const auto& kernel : kernels ) + config.addEntryEnum( kernel ); + + config.addDelimiter( "Roi settings:" ); + + config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 ); + config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 ); + + return config; + } + + template< typename Image > + bool + readImage( const TNL::Config::ParameterContainer& parameters, + GridPointer& grid, + MeshFunctionType& meshFunction, + Image& image, + TNL::Images::RegionOfInterest< int >& roi ) const + { + auto input = parameters.getParameter< TNL::String >( "input" ); + + if( image.openForRead( input ) ) { + if( ! roi.setup( parameters, &image ) ) { + std::cout << "Invalid image roi."; + image.close(); + return false; + } + + std::cout << image.getWidth() << " " << image.getHeight() << std::endl; + + auto meshPointer = meshFunction.getMeshPointer(); + + meshPointer->setDimensions( image.getWidth(), image.getHeight() ); + + meshFunction.setMesh( meshPointer ); + + if( ! image.read( roi, meshFunction ) ) { + std::cout << "Invalid image size" << std::endl; + + image.close(); + return false; + } + + image.close(); + + std::cout << "Image read was successful: " << meshFunction.getData().getSize() << " elements count" << std::endl; + return true; + } + + std::cout << "Image open for read failed. Please check file path" << std::endl; + + return false; + } + + bool + convolve( const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction ) const + { + auto imageData = meshFunction.getData().getConstView(); + + Vector kernelSize; + DataStore kernel; + + kernel = getKernel( parameters, kernelSize ); + + DataStore result; + + result.setLike( imageData ); + result = 0; + + TNL::Timer timer; + + timer.start(); + + std::cout << imageData.getSize() << " " << result.getSize() << std::endl; + + launchConvolution( + imageData, kernel.getConstView(), result.getView(), meshFunction.getMeshPointer()->getDimensions(), kernelSize ); + + timer.stop(); + + result.forAllElements( + [] __cuda_callable__( int i, float& value ) + { + value = TNL::max( TNL::min( value, 1.0 ), 0.0 ); + } ); + + meshFunction.getData() = result; + + std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl; + + return true; + } + + template< typename Image > + bool + write( const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction ) const + { + auto output = parameters.getParameter< TNL::String >( "output" ); + GridType grid = meshFunction.getMesh(); + + if( image.openForWrite( output, grid ) ) { + if( ! image.write( meshFunction ) ) { + std::cout << "Image write failed" << std::endl; + + image.close(); + return false; + } + + image.close(); + + return true; + } + + std::cout << "Image open for write failed. Please check file path" << std::endl; + + return false; + } + + HostDataStore + getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const + { + auto kernel = parameters.getParameter< TNL::String >( "kernel" ); + + if( kernel == "identity" ) { + kernelDimension = { 3, 3 }; + + return { 0, 0, 0, 0, 1, 0, 0, 0, 0 }; + } + + if( kernel == "gauss3x3" ) { + kernelDimension = { 3, 3 }; + + HostDataStore kernel = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; + + kernel /= 16; + + return kernel; + } + + if( kernel == "gauss5x5" ) { + kernelDimension = { 5, 5 }; + + HostDataStore kernel = { 1, 4, 7, 4, 1, 4, 16, 26, 16, 4, 7, 26, 41, 26, 7, 4, 16, 26, 16, 4, 1, 4, 7, 4, 1 }; + + kernel /= 273; + + return kernel; + } + + if( kernel == "sobelHorizontal" ) { + kernelDimension = { 3, 3 }; + + return { 1, 2, 1, 0, 0, 0, -1, -2, -1 }; + } + + if( kernel == "sobelVertical" ) { + kernelDimension = { 3, 3 }; + + return { 1, 0, -1, 2, 0, -2, 1, 0, -1 }; + } + + if( kernel == "edgeDetection" ) { + kernelDimension = { 3, 3 }; + + return { -1, -1, -1, -1, 8, -1, -1, -1, -1 }; + } + + std::cout << "Unknown kernel " << kernel << ". Exit" << std::endl; + exit( 1 ); + } + + void + launchConvolution( DataStore::ConstViewType image, + DataStore::ConstViewType kernel, + DataStore::ViewType result, + const GridType::CoordinatesType& imageDimension, + const GridType::CoordinatesType& kernelDimension ) const + { + DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel ); + } +}; diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h new file mode 100644 index 0000000000000000000000000000000000000000..d80373f0df34d7b552adc3a5940e8b7b18386d01 --- /dev/null +++ b/src/Benchmarks/Convolution/support/Solver.h @@ -0,0 +1,41 @@ + +#pragma once + +#include + +#include +#include + +template< int Dimension, typename Device > +class Solver +{ +public: + void + solve( const TNL::Config::ParameterContainer& parameters ) const + { + if( ! TNL::Devices::Cuda::setup( parameters ) ) + return; + + start( parameters ); + } + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const + { + TNL_ASSERT_TRUE( false, "Should be overriden" ); + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const + { + TNL::Config::ConfigDescription config; + + config.addDelimiter( "Device settings:" ); + +#ifdef HAVE_CUDA + TNL::Devices::Cuda::configSetup( config ); +#endif + + return config; + } +}; diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..4be1e80e5b4494ea0e80da66e9357d3a771347d7 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_benchmark.h @@ -0,0 +1,26 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL_VALUE +#include "../support/DummyBenchmark.h" + +#include + +using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >; + +int main(int argc, char* argv[]) +{ + TaskBenchmark benchmark; + + auto config = benchmark.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + benchmark.run( parameters ); + + return 0; +} diff --git a/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h new file mode 100644 index 0000000000000000000000000000000000000000..c258f074005746a4e78b8f466952bfe404415a77 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h @@ -0,0 +1,26 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL +#include "../support/HeatEquationSolver.h" + +#include + +using TaskSolver = HeatEquationSolver<>; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} diff --git a/src/Benchmarks/Convolution/templates/main_image_solver.h b/src/Benchmarks/Convolution/templates/main_image_solver.h new file mode 100644 index 0000000000000000000000000000000000000000..c2b9c1440c17c4f9c38f00393e85b19848b0c037 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_image_solver.h @@ -0,0 +1,26 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL +#include "../support/ImageSolver.h" + +#include + +using TaskSolver = ImageSolver; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h new file mode 100644 index 0000000000000000000000000000000000000000..ab2bc86990cbe2123d198d489d347dfe8444d372 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_solver.h @@ -0,0 +1,27 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL +#include "../support/DummySolver.h" + +#include + + +using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} diff --git a/src/TNL/Images/PNGImage_impl.h b/src/TNL/Images/PNGImage_impl.h index 3b946ff8276f5ea798f92dad5039adedd7417a73..5efd66c84b5ab05c8078eba910a2804c30679bc8 100644 --- a/src/TNL/Images/PNGImage_impl.h +++ b/src/TNL/Images/PNGImage_impl.h @@ -266,6 +266,7 @@ PNGImage< Index >::write( const Meshes::Grid< 2, Real, Device, Index >& grid, Ve for( j = 0; j < grid.getDimensions().x(); j++ ) { cell.getCoordinates().x() = j; cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i; + cell.refresh(); // Index cellIndex = grid.getCellIndex( CoordinatesType( j, // grid.getDimensions().y() - 1 - i ) ); @@ -305,6 +306,7 @@ PNGImage< Index >::write( const Functions::MeshFunction< Meshes::Grid< 2, MeshRe for( j = 0; j < grid.getDimensions().x(); j++ ) { cell.getCoordinates().x() = j; cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i; + cell.refresh(); // Index cellIndex = grid.getCellIndex( CoordinatesType( j, // grid.getDimensions().y() - 1 - i ) );