Loading src/Benchmarks/Convolution/CMakeLists.txt +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ if (${BUILD_CUDA}) FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) else() Loading src/Benchmarks/Convolution/kernels/naive.h +78 −55 Original line number Diff line number Diff line Loading @@ -37,6 +37,10 @@ convolution1D( Index kernelWidth, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX) return; Index radius = kernelWidth >> 1; Real result = 0; Loading Loading @@ -90,8 +94,11 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX || iy >= endY) return; Index radiusY = kernelHeight >> 1; Index radiusX = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Real result = 0; Loading @@ -115,59 +122,75 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } // template<> // struct Convolution< 3, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution3D( Index kernelWidth, // Index kernelHeight, // Index kernelDepth, // Index endX, // Index endY, // Index endZ, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int ix = threadIdx.x + blockIdx.x * blockDim.x; // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int iz = threadIdx.z + blockIdx.z * blockDim.z; // Real result = 0; // for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) { // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) { // result = convolve( result, fetchBoundary( i, j, k ) ); // } // else { // result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) ); // } // } // } // } // store( ix, iy, iz, result ); // } template<> struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) { return 0; } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution3D( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX || iy >= endY || iz >= endZ) return; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Real result = 0; for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } } } } store( ix, iy, iz, result ); } #endif src/Benchmarks/Convolution/support/DummyTask.h +47 −37 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ public: using Launcher = Launcher< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel ) exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i ) { Loading Loading @@ -108,46 +108,56 @@ public: } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > // { // public: // static constexpr int Dimension = 3; // using Device = TNL::Devices::Cuda; // using Vector = TNL::Containers::StaticVector< Dimension, Index >; // using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; // using Launcher = Launcher< Dimension, Device >; // static void // exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) // { // auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { template< typename Index, typename Real > struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > { public: static constexpr int Dimension = 3; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; using Launcher = Launcher< Dimension, Device >; // }; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); // auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { return input[index]; }; // }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { return 1; }; // auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y(); // }; return kernel[ index ]; }; // auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) // { // return result + data * kernel; // }; auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) { return result + data * kernel; }; // auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) { auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable { auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); // }; result[ index ] = resultValue; }; // Launcher::exec< Index >( dimensions, // kernelSize, // std::forward< decltype( fetchData ) >( fetchData ), // std::forward< decltype( fetchBoundary ) >( fetchBoundary ), // std::forward< decltype( fetchKernel ) >( fetchKernel ), // std::forward< decltype( convolve ) >( convolve ), // std::forward< decltype( store ) >( store ) ); // } // }; Launcher::exec< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( fetchKernel ) >( fetchKernel ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } }; src/Benchmarks/Convolution/support/Launcher.h +88 −92 Original line number Diff line number Diff line Loading @@ -119,95 +119,91 @@ public: } }; // template<> // struct Launcher< 3, TNL::Devices::Cuda > // { // public: // using Vector = TNL::Containers::StaticVector< 3, int >; // using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; // template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > // static inline void // exec( const Vector& dimensions, // const Vector& kernelSize, // FetchData&& fetchData, // FetchBoundary&& fetchBoundary, // FetchKernel&& fetchKernel, // Convolve&& convolve, // Store&& store ) // { // const Index sizeX = dimensions.x(); // const Index sizeY = dimensions.y(); // const Index sizeZ = dimensions.z(); // TNL::Cuda::LaunchConfiguration launchConfig; // launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( // kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); // if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { // launchConfig.blockSize.x = TNL::min( 256, sizeX ); // launchConfig.blockSize.y = 1; // launchConfig.blockSize.z = 1; // } // else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 256, sizeY ); // launchConfig.blockSize.z = 1; // } // else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 2, sizeX ); // launchConfig.blockSize.y = TNL::min( 2, sizeY ); // // CUDA allows max 64 for launchConfig.blockSize.z // launchConfig.blockSize.z = TNL::min( 64, sizeZ ); // } // else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = TNL::min( 8, sizeY ); // launchConfig.blockSize.z = 1; // } // else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = 1; // launchConfig.blockSize.z = TNL::min( 8, sizeZ ); // } // else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 32, sizeY ); // launchConfig.blockSize.z = TNL::min( 8, sizeZ ); // } // else { // launchConfig.blockSize.x = TNL::min( 16, sizeX ); // launchConfig.blockSize.y = TNL::min( 4, sizeY ); // launchConfig.blockSize.z = TNL::min( 4, sizeZ ); // } // launchConfig.gridSize.x = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); // launchConfig.gridSize.y = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); // launchConfig.gridSize.z = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); // dim3 gridCount; // gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); // gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); // gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z ); // constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; // TNL::Cuda::launchKernel< true >( kernel, // 0, // launchConfig, // kernelSize.x(), // kernelSize.y(), // kernelSize.z(), // dimensions.x(), // dimensions.y(), // dimensions.z(), // std::forward< FetchData >( fetchData ), // std::forward< FetchBoundary >( fetchBoundary ), // std::forward< FetchKernel >( fetchKernel ), // std::forward< Convolve >( convolve ), // std::forward< Store >( store ) ); // } // }; template<> struct Launcher< 3, TNL::Devices::Cuda > { public: using Vector = TNL::Containers::StaticVector< 3, int >; using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static inline void exec( const Vector& dimensions, const Vector& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); const Index sizeZ = dimensions.z(); TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = 1; } else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 2, sizeX ); launchConfig.blockSize.y = TNL::min( 2, sizeY ); // CUDA allows max 64 for launchConfig.blockSize.z launchConfig.blockSize.z = TNL::min( 64, sizeZ ); } else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 32, sizeY ); launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else { launchConfig.blockSize.x = TNL::min( 16, sizeX ); launchConfig.blockSize.y = TNL::min( 4, sizeY ); launchConfig.blockSize.z = TNL::min( 4, sizeZ ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); launchConfig.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, launchConfig, kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z(), fetchData, fetchBoundary, fetchKernel, convolve, store ); } }; Loading
src/Benchmarks/Convolution/CMakeLists.txt +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ if (${BUILD_CUDA}) FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) else() Loading
src/Benchmarks/Convolution/kernels/naive.h +78 −55 Original line number Diff line number Diff line Loading @@ -37,6 +37,10 @@ convolution1D( Index kernelWidth, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX) return; Index radius = kernelWidth >> 1; Real result = 0; Loading Loading @@ -90,8 +94,11 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX || iy >= endY) return; Index radiusY = kernelHeight >> 1; Index radiusX = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Real result = 0; Loading @@ -115,59 +122,75 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } // template<> // struct Convolution< 3, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution3D( Index kernelWidth, // Index kernelHeight, // Index kernelDepth, // Index endX, // Index endY, // Index endZ, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int ix = threadIdx.x + blockIdx.x * blockDim.x; // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int iz = threadIdx.z + blockIdx.z * blockDim.z; // Real result = 0; // for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) { // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) { // result = convolve( result, fetchBoundary( i, j, k ) ); // } // else { // result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) ); // } // } // } // } // store( ix, iy, iz, result ); // } template<> struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) { return 0; } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution3D( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if (ix >= endX || iy >= endY || iz >= endZ) return; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Real result = 0; for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } } } } store( ix, iy, iz, result ); } #endif
src/Benchmarks/Convolution/support/DummyTask.h +47 −37 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ public: using Launcher = Launcher< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel ) exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i ) { Loading Loading @@ -108,46 +108,56 @@ public: } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > // { // public: // static constexpr int Dimension = 3; // using Device = TNL::Devices::Cuda; // using Vector = TNL::Containers::StaticVector< Dimension, Index >; // using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; // using Launcher = Launcher< Dimension, Device >; // static void // exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) // { // auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { template< typename Index, typename Real > struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > { public: static constexpr int Dimension = 3; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; using Launcher = Launcher< Dimension, Device >; // }; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); // auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { return input[index]; }; // }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { return 1; }; // auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) { auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y(); // }; return kernel[ index ]; }; // auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) // { // return result + data * kernel; // }; auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) { return result + data * kernel; }; // auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) { auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable { auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); // }; result[ index ] = resultValue; }; // Launcher::exec< Index >( dimensions, // kernelSize, // std::forward< decltype( fetchData ) >( fetchData ), // std::forward< decltype( fetchBoundary ) >( fetchBoundary ), // std::forward< decltype( fetchKernel ) >( fetchKernel ), // std::forward< decltype( convolve ) >( convolve ), // std::forward< decltype( store ) >( store ) ); // } // }; Launcher::exec< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( fetchKernel ) >( fetchKernel ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } };
src/Benchmarks/Convolution/support/Launcher.h +88 −92 Original line number Diff line number Diff line Loading @@ -119,95 +119,91 @@ public: } }; // template<> // struct Launcher< 3, TNL::Devices::Cuda > // { // public: // using Vector = TNL::Containers::StaticVector< 3, int >; // using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; // template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > // static inline void // exec( const Vector& dimensions, // const Vector& kernelSize, // FetchData&& fetchData, // FetchBoundary&& fetchBoundary, // FetchKernel&& fetchKernel, // Convolve&& convolve, // Store&& store ) // { // const Index sizeX = dimensions.x(); // const Index sizeY = dimensions.y(); // const Index sizeZ = dimensions.z(); // TNL::Cuda::LaunchConfiguration launchConfig; // launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( // kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); // if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { // launchConfig.blockSize.x = TNL::min( 256, sizeX ); // launchConfig.blockSize.y = 1; // launchConfig.blockSize.z = 1; // } // else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 256, sizeY ); // launchConfig.blockSize.z = 1; // } // else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 2, sizeX ); // launchConfig.blockSize.y = TNL::min( 2, sizeY ); // // CUDA allows max 64 for launchConfig.blockSize.z // launchConfig.blockSize.z = TNL::min( 64, sizeZ ); // } // else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = TNL::min( 8, sizeY ); // launchConfig.blockSize.z = 1; // } // else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = 1; // launchConfig.blockSize.z = TNL::min( 8, sizeZ ); // } // else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 32, sizeY ); // launchConfig.blockSize.z = TNL::min( 8, sizeZ ); // } // else { // launchConfig.blockSize.x = TNL::min( 16, sizeX ); // launchConfig.blockSize.y = TNL::min( 4, sizeY ); // launchConfig.blockSize.z = TNL::min( 4, sizeZ ); // } // launchConfig.gridSize.x = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); // launchConfig.gridSize.y = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); // launchConfig.gridSize.z = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); // dim3 gridCount; // gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); // gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); // gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z ); // constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; // TNL::Cuda::launchKernel< true >( kernel, // 0, // launchConfig, // kernelSize.x(), // kernelSize.y(), // kernelSize.z(), // dimensions.x(), // dimensions.y(), // dimensions.z(), // std::forward< FetchData >( fetchData ), // std::forward< FetchBoundary >( fetchBoundary ), // std::forward< FetchKernel >( fetchKernel ), // std::forward< Convolve >( convolve ), // std::forward< Store >( store ) ); // } // }; template<> struct Launcher< 3, TNL::Devices::Cuda > { public: using Vector = TNL::Containers::StaticVector< 3, int >; using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static inline void exec( const Vector& dimensions, const Vector& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); const Index sizeZ = dimensions.z(); TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = 1; } else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 2, sizeX ); launchConfig.blockSize.y = TNL::min( 2, sizeY ); // CUDA allows max 64 for launchConfig.blockSize.z launchConfig.blockSize.z = TNL::min( 64, sizeZ ); } else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 32, sizeY ); launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else { launchConfig.blockSize.x = TNL::min( 16, sizeX ); launchConfig.blockSize.y = TNL::min( 4, sizeY ); launchConfig.blockSize.z = TNL::min( 4, sizeZ ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); launchConfig.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, launchConfig, kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z(), fetchData, fetchBoundary, fetchKernel, convolve, store ); } };