Loading src/Benchmarks/Convolution/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -23,3 +23,4 @@ endif() endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h") src/Benchmarks/Convolution/kernels/naive.h +54 −45 Original line number Diff line number Diff line Loading @@ -56,55 +56,64 @@ convolution1D( Index kernelWidth, store( ix, result ); } // template<> // struct Convolution< 2, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) // { // return 0; // } // }; template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) { return 0; } }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution2D( Index kernelWidth, // Index kernelHeight, // Index endX, // Index endY, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int ix = threadIdx.x + blockIdx.x * blockDim.x; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; // Real result = 0; Index radiusY = kernelHeight >> 1; Index radiusX = kernelHeight >> 1; // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY ) { // result = convolve( result, fetchBoundary( i, j ) ); // } // else { // result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) ); // } // } // } Real result = 0; // store( ix, iy, result ); // } for( Index j = - radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = - radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); } } } store( ix, iy, result ); } // template<> // struct Convolution< 3, TNL::Devices::Cuda > Loading src/Benchmarks/Convolution/support/DummyTask.h +44 −44 Original line number Diff line number Diff line Loading @@ -54,59 +54,59 @@ public: } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > // { // public: // static constexpr int Dimension = 2; // using Device = TNL::Devices::Cuda; // using Vector = TNL::Containers::StaticVector< Dimension, Index >; // using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; // using Launcher = Launcher< Dimension, Device >; template< typename Index, typename Real > struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > { public: static constexpr int Dimension = 2; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; using Launcher = Launcher< Dimension, Device >; // static void // exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) // { // auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) // { // auto index = i + j * dimensions.x(); static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * dimensions.x(); // return input[ index ]; // }; return input[ index ]; }; // auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) // { // return -1; // }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { return -1; }; // auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) // { // auto index = i + j * kernel.x(); auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * kernelSize.x(); // return kernel[ index ]; // }; return kernel[ index ]; }; // auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) // { // return result + data * kernel; // }; auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) { return result + data * kernel; }; // auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) // { // auto index = i + j * dimensions.x(); auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable { auto index = i + j * dimensions.x(); // result[ index ] = resultValue; // }; result[ index ] = resultValue; }; // Launcher::exec< Index >( dimensions, // kernelSize, // std::forward< decltype( fetchData ) >( fetchData ), // std::forward< decltype( fetchBoundary ) >( fetchBoundary ), // std::forward< decltype( fetchKernel ) >( fetchKernel ), // std::forward< decltype( convolve ) >( convolve ), // std::forward< decltype( store ) >( store ) ); // } // }; Launcher::exec< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( fetchKernel ) >( fetchKernel ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > Loading src/Benchmarks/Convolution/support/Launcher.h +52 −57 Original line number Diff line number Diff line Loading @@ -59,70 +59,65 @@ public: } }; // template<> // struct Launcher< 2, TNL::Devices::Cuda > // { // public: // using Vector = TNL::Containers::StaticVector< 2, int >; // using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; // template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > // static inline void // exec( const Vector& dimensions, // const Vector& kernelSize, // FetchData&& fetchData, // FetchBoundary&& fetchBoundary, // FetchKernel&& fetchKernel, // Convolve&& convolve, // Store&& store ) // { // TNL::Cuda::LaunchConfiguration launchConfig; // launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( // kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); template<> struct Launcher< 2, TNL::Devices::Cuda > { public: using Vector = TNL::Containers::StaticVector< 2, int >; using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; // const Index sizeX = dimensions.x(); // const Index sizeY = dimensions.y(); template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static inline void exec( const Vector& dimensions, const Vector& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration launchConfig; // if( sizeX >= sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 256, sizeX ); // launchConfig.blockSize.y = 1; // } // else if( sizeY >= sizeX * sizeX ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 256, sizeY ); // } // else { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = TNL::min( 8, sizeY ); // } launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); // launchConfig.gridSize.x = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); // launchConfig.gridSize.y = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); // dim3 gridCount; if( sizeX >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; } else if( sizeY >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); } else { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); } // gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); // gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); // constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; // TNL::Cuda::launchKernel< true >( kernel, // 0, // launchConfig, // kernelSize.x(), // kernelSize.y(), // dimensions.x(), // dimensions.y(), // std::forward< FetchData >( fetchData ), // std::forward< FetchBoundary >( fetchBoundary ), // std::forward< FetchKernel >( fetchKernel ), // std::forward< Convolve >( convolve ), // std::forward< Store >( store ) ); // } // }; TNL::Cuda::launchKernel< true >( kernel, 0, launchConfig, kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y(), fetchData, fetchBoundary, fetchKernel, convolve, store ); } }; // template<> // struct Launcher< 3, TNL::Devices::Cuda > Loading Loading
src/Benchmarks/Convolution/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -23,3 +23,4 @@ endif() endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")
src/Benchmarks/Convolution/kernels/naive.h +54 −45 Original line number Diff line number Diff line Loading @@ -56,55 +56,64 @@ convolution1D( Index kernelWidth, store( ix, result ); } // template<> // struct Convolution< 2, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) // { // return 0; // } // }; template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) { return 0; } }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution2D( Index kernelWidth, // Index kernelHeight, // Index endX, // Index endY, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int ix = threadIdx.x + blockIdx.x * blockDim.x; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; // Real result = 0; Index radiusY = kernelHeight >> 1; Index radiusX = kernelHeight >> 1; // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY ) { // result = convolve( result, fetchBoundary( i, j ) ); // } // else { // result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) ); // } // } // } Real result = 0; // store( ix, iy, result ); // } for( Index j = - radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = - radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); } } } store( ix, iy, result ); } // template<> // struct Convolution< 3, TNL::Devices::Cuda > Loading
src/Benchmarks/Convolution/support/DummyTask.h +44 −44 Original line number Diff line number Diff line Loading @@ -54,59 +54,59 @@ public: } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > // { // public: // static constexpr int Dimension = 2; // using Device = TNL::Devices::Cuda; // using Vector = TNL::Containers::StaticVector< Dimension, Index >; // using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; // using Launcher = Launcher< Dimension, Device >; template< typename Index, typename Real > struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > { public: static constexpr int Dimension = 2; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; using Launcher = Launcher< Dimension, Device >; // static void // exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) // { // auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) // { // auto index = i + j * dimensions.x(); static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * dimensions.x(); // return input[ index ]; // }; return input[ index ]; }; // auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) // { // return -1; // }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { return -1; }; // auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) // { // auto index = i + j * kernel.x(); auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) { auto index = i + j * kernelSize.x(); // return kernel[ index ]; // }; return kernel[ index ]; }; // auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) // { // return result + data * kernel; // }; auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) { return result + data * kernel; }; // auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) // { // auto index = i + j * dimensions.x(); auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable { auto index = i + j * dimensions.x(); // result[ index ] = resultValue; // }; result[ index ] = resultValue; }; // Launcher::exec< Index >( dimensions, // kernelSize, // std::forward< decltype( fetchData ) >( fetchData ), // std::forward< decltype( fetchBoundary ) >( fetchBoundary ), // std::forward< decltype( fetchKernel ) >( fetchKernel ), // std::forward< decltype( convolve ) >( convolve ), // std::forward< decltype( store ) >( store ) ); // } // }; Launcher::exec< Index, Real >( dimensions, kernelSize, std::forward< decltype( fetchData ) >( fetchData ), std::forward< decltype( fetchBoundary ) >( fetchBoundary ), std::forward< decltype( fetchKernel ) >( fetchKernel ), std::forward< decltype( convolve ) >( convolve ), std::forward< decltype( store ) >( store ) ); } }; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > Loading
src/Benchmarks/Convolution/support/Launcher.h +52 −57 Original line number Diff line number Diff line Loading @@ -59,70 +59,65 @@ public: } }; // template<> // struct Launcher< 2, TNL::Devices::Cuda > // { // public: // using Vector = TNL::Containers::StaticVector< 2, int >; // using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; // template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > // static inline void // exec( const Vector& dimensions, // const Vector& kernelSize, // FetchData&& fetchData, // FetchBoundary&& fetchBoundary, // FetchKernel&& fetchKernel, // Convolve&& convolve, // Store&& store ) // { // TNL::Cuda::LaunchConfiguration launchConfig; // launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( // kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); template<> struct Launcher< 2, TNL::Devices::Cuda > { public: using Vector = TNL::Containers::StaticVector< 2, int >; using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; // const Index sizeX = dimensions.x(); // const Index sizeY = dimensions.y(); template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static inline void exec( const Vector& dimensions, const Vector& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration launchConfig; // if( sizeX >= sizeY * sizeY ) { // launchConfig.blockSize.x = TNL::min( 256, sizeX ); // launchConfig.blockSize.y = 1; // } // else if( sizeY >= sizeX * sizeX ) { // launchConfig.blockSize.x = 1; // launchConfig.blockSize.y = TNL::min( 256, sizeY ); // } // else { // launchConfig.blockSize.x = TNL::min( 32, sizeX ); // launchConfig.blockSize.y = TNL::min( 8, sizeY ); // } launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); // launchConfig.gridSize.x = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); // launchConfig.gridSize.y = // TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); // dim3 gridCount; if( sizeX >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; } else if( sizeY >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); } else { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); } // gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); // gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); // constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; // TNL::Cuda::launchKernel< true >( kernel, // 0, // launchConfig, // kernelSize.x(), // kernelSize.y(), // dimensions.x(), // dimensions.y(), // std::forward< FetchData >( fetchData ), // std::forward< FetchBoundary >( fetchBoundary ), // std::forward< FetchKernel >( fetchKernel ), // std::forward< Convolve >( convolve ), // std::forward< Store >( store ) ); // } // }; TNL::Cuda::launchKernel< true >( kernel, 0, launchConfig, kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y(), fetchData, fetchBoundary, fetchKernel, convolve, store ); } }; // template<> // struct Launcher< 3, TNL::Devices::Cuda > Loading