Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -56,7 +56,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; //__shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; Loading Loading @@ -237,8 +237,8 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc Index blocksCount; const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridXSize(); // Fill blocks size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block Loading src/TNL/Cuda/LaunchHelpers.h +15 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,21 @@ inline constexpr std::size_t getMaxGridSize() return 65535; } inline constexpr size_t getMaxGridXSize() { return 2147483647;//65535; } inline constexpr size_t getMaxGridYSize() { return 65535; } inline constexpr size_t getMaxGridZSize() { return 65535; } inline constexpr int getMaxBlockSize() { return 1024; Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -56,7 +56,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; //__shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; Loading Loading @@ -237,8 +237,8 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc Index blocksCount; const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridXSize(); // Fill blocks size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block Loading
src/TNL/Cuda/LaunchHelpers.h +15 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,21 @@ inline constexpr std::size_t getMaxGridSize() return 65535; } inline constexpr size_t getMaxGridXSize() { return 2147483647;//65535; } inline constexpr size_t getMaxGridYSize() { return 65535; } inline constexpr size_t getMaxGridZSize() { return 65535; } inline constexpr int getMaxBlockSize() { return 1024; Loading