Commit 97284805 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement kernel, which loads data in shared memory

parent d3b6676e
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -38,3 +38,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")

# Solver and benchmark executables for the shared-data kernel variant
# (kernels/sharedData.h) in 1D, 2D and 3D.
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")
+432 −0
Original line number Diff line number Diff line
#pragma once

#ifdef HAVE_CUDA

/**
 * The kernels in this file stage an image tile in shared memory
 * and then compute the convolution from it.
 *
 * Thanks for the idea: https://www.evl.uic.edu/sjames/cs525/final.html
 */

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

// Primary template; specialized below per dimension for TNL::Devices::Cuda.
template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   /**
    * Prepares the CUDA launch configuration for the 1D shared-data kernel.
    *
    * The block size is set equal to the kernel size and the grid covers the
    * data extent (capped at the device's maximum grid size).
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // convolution1D stores a left halo value at shared[ threadIdx.x ] and a
      // right halo value at shared[ threadIdx.x + blockDim.x ]; with
      // blockDim.x == kernelSize.x() the highest written slot is
      // 2 * kernelSize.x() - 1, so 2 * kernelSize.x() elements are required.
      // The previous sizing ( 2 * k - 1 ) was one element short, letting the
      // last thread of each block write past the allocation.
      const Index sharedElementCount = 2 * kernelSize.x();

      configuration.dynamicSharedMemorySize = sharedElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }
};

/**
 * 1D convolution kernel that caches the input tile plus halos in dynamic
 * shared memory before accumulating the result.
 *
 * Expected launch: blockDim.x == kernelWidth, 1D grid covering endX,
 * dynamic shared memory of at least 2 * blockDim.x Real elements
 * (see the matching Convolution< 1, Cuda >::setup()).
 *
 * fetchData( i )      — input value at in-range index i
 * fetchBoundary( i )  — input value for an out-of-range index i
 * fetchKernel( i )    — i-th kernel coefficient
 * convolve( acc, v, k ) — accumulation step; store( i, result ) — output.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global index of the output element handled by this thread.
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach the __syncthreads()
   // below; in a partially filled last block this is a divergent barrier
   // (undefined behavior) and their shared slots stay unwritten — verify
   // that blocks are always full or restructure the guard.
   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   // Half-width of the kernel (floor for even widths).
   Index radius = kernelWidth >> 1;

   // Left
   // Each thread loads the element `radius` positions to its left into its
   // own slot, so shared[ p ] holds the input at blockStart + p - radius.
   Index lhs = ix - radius;

   if( lhs < 0 ) {
      shared[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
      shared[ threadIdx.x ] = fetchData( lhs );
   }

   // Right
   // NOTE(review): the last thread writes slot 2 * blockDim.x - 1, so the
   // dynamic shared memory must provide at least 2 * blockDim.x elements
   // (the original setup() allocated only 2 * kernelWidth - 1) — verify.
   // NOTE(review): slot threadIdx.x + blockDim.x should hold the input at
   // blockStart + threadIdx.x + blockDim.x - radius to match the read loop
   // below, but fetchData( ix + radius ) is one element earlier for odd
   // kernel widths — suspected off-by-one in the right halo; confirm.
   Index rhs = ix + radius;

   if( rhs >= endX ) {
      shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
      shared[ threadIdx.x + blockDim.x ] = fetchData( rhs );
   }

   // All halo loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // Accumulate the full kernel width; thread t reads slots t .. t+kernelWidth-1.
   #pragma unroll
   for( Index i = 0; i < kernelWidth; i++ ) {
      Index elementIndex = i + threadIdx.x;

      result = convolve( result, shared[ elementIndex ], fetchKernel( i ) );
   }

   store( ix, result );
}

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   /**
    * Prepares the CUDA launch configuration for the 2D shared-data kernel.
    *
    * The block size copies the kernel size in both dimensions, the grid
    * covers the data extent (capped at the device's maximum grid size) and
    * the dynamic shared memory holds ( 2 * k - 1 ) elements per dimension.
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // Shared-memory element count: product of ( 2 * k - 1 ) over all axes.
      Index sharedElements = 1;
      for( Index axis = 0; axis < kernelSize.getSize(); axis++ )
         sharedElements *= 2 * kernelSize[ axis ] - 1;

      configuration.dynamicSharedMemorySize = sharedElements * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      const auto gridLimit = TNL::Cuda::getMaxGridSize();
      configuration.gridSize.x =
         TNL::min( gridLimit, TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( gridLimit, TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }
};

/**
 * 2D convolution kernel that stages four overlapping corner regions of the
 * input in dynamic shared memory before accumulating the result.
 *
 * Expected launch: blockDim == ( kernelWidth, kernelHeight ), 2D grid
 * covering ( endX, endY ), dynamic shared memory sized by
 * Convolution< 2, Cuda >::setup().
 *
 * NOTE(review): the allocation is ( 2k - 1 ) x ( 2k - 1 ) elements but the
 * indexing below uses blockDim.x as the row stride, never the allocated row
 * width — the intended tile layout should be re-checked.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global coordinates of the output element handled by this thread.
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach __syncthreads()
   // below — divergent barrier in partially filled edge blocks; verify.
   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   // Half-widths of the kernel (floor for even sizes).
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, index;

   // Top Left
   // Stores the element ( radiusX, radiusY ) up-left of this thread.
   x = ix - radiusX;
   y = iy - radiusY;

   index = threadIdx.x + threadIdx.y * blockDim.x;

   if( x < 0 || y < 0 ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Top right
   // NOTE(review): for columns in [ radiusX, blockDim.x ) this slot is also
   // written by the Top-Left store of another thread with a different value
   // — suspected shared-memory race / wrong tile offsets; confirm.
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;

   if( x >= endX || y < 0 ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // All tile loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // NOTE(review): the stores above use blockDim.x as the row stride, but
   // this read loop uses blockDim.y — inconsistent whenever
   // kernelWidth != kernelHeight; suspected typo for blockDim.x.
   // NOTE(review): the loops cover only ( radius + 1 ) taps per axis, not the
   // full kernelWidth x kernelHeight as the 1D kernel does — confirm intent.
   for( Index j = 0; j <= radiusY; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.y;

      for( Index i = 0; i <= radiusX; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
      }
   }

   store( ix, iy, result );
}

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   /**
    * Prepares the CUDA launch configuration for the 3D shared-data kernel.
    *
    * The block size copies the kernel size in all three dimensions, the grid
    * covers the data extent (capped at the device's maximum grid size) and
    * the dynamic shared memory holds ( 2 * k - 1 ) elements per dimension.
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      // Fix: the z extent was previously assigned to gridSize.y (copy-paste
      // error), which both overwrote the y grid size and left gridSize.z at
      // its default of 1 — the grid never covered the z dimension.
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }
};

/**
 * 3D convolution kernel that stages eight overlapping corner regions of the
 * input in dynamic shared memory before accumulating the result.
 *
 * Expected launch: blockDim == ( kernelWidth, kernelHeight, kernelDepth ),
 * 3D grid covering ( endX, endY, endZ ), dynamic shared memory sized by
 * Convolution< 3, Cuda >::setup().
 *
 * NOTE(review): the corner stores below use blockDim.y as the row (y) stride
 * while the slice (z) stride is blockDim.x * blockDim.y; a linear layout is
 * only self-consistent when blockDim.x == blockDim.y, and the 2D kernel uses
 * blockDim.x for the row stride — suspected typo throughout; confirm.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global coordinates of the output element handled by this thread.
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach __syncthreads()
   // below — divergent barrier in partially filled edge blocks; verify.
   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   // Half-widths of the kernel (floor for even sizes).
   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, z, index;

   // Z: 0 Y: 0 X: 0
   // Each of the eight sections loads one corner of the halo region; out of
   // range coordinates are redirected to fetchBoundary.
   x = ix - radiusX;
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 0 X: 1
   // NOTE(review): as in the 2D kernel, corner regions offset by only
   // radius overlap the unshifted region of neighbouring threads with
   // different values — suspected shared-memory race; confirm layout.
   x = ix + radiusX;
   y = iy - radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 0
   // NOTE(review): this is the X-left corner ( x = ix - radiusX ) yet the
   // slot includes a `radiusX +` offset, making it identical to the
   // "Z: 0 Y: 1 X: 1" slot below — suspected copy-paste error; the offset
   // should likely be dropped here.
   x = ix - radiusX;
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 0
   x = ix - radiusX;
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 1
   x = ix + radiusX;
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 1
   x = ix + radiusX;
   y = iy - radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 0
   x = ix - radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 1
   x = ix + radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // All tile loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // NOTE(review): the loops cover only ( radius + 1 ) taps per axis, not
   // the full kernel extent as the 1D kernel does — confirm intent.
   // NOTE(review): xAlign uses blockDim.y as the row stride (matching the
   // stores above), which differs from the 2D kernel's blockDim.x — verify.
   for( Index k = 0; k <= radiusZ; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.y;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;

            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
         }
      }
   }

   store( ix, iy, iz, result );
}

#endif
+26 −16
Original line number Diff line number Diff line
@@ -8,6 +8,10 @@
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

/**
 * This method stores kernel in the shared memory to reduce amount of loads.
 */

template< int Dimension, typename Device >
struct Convolution;

@@ -52,10 +56,14 @@ convolution1D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radius = kernelWidth >> 1;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // The size of the block is equal to the kernel size
   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
@@ -126,14 +134,17 @@ convolution2D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;

   // The size of the block is equal to the kernel size
@@ -217,12 +228,15 @@ convolution3D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
@@ -255,14 +269,10 @@ convolution3D( Index kernelWidth,
            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
                || elementIndexZ >= endZ )
            {
               result = convolve( result,
                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[threadIndex] );
               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
            }
            else {
               result = convolve( result,
                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[threadIndex] );
               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
            }
         }
      }