diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index ea008fdc7c6efc3ae75b76142b4f318dce823791..113008ad080edc6611e36a8d6ed9b0391ea49110 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -32,8 +32,6 @@ struct CSRAdaptiveKernelView

    CSRAdaptiveKernelView() = default;

-   CSRAdaptiveKernelView( BlocksType& blocks );
-
    void setBlocks( BlocksType& blocks, const int idx );

    ViewType getView();
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index d4a369f252ffd61be1ade8589a3070bccfc032f7..427b5eba7b2d3ab11eb9a09617eff873a03c4cea 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -26,11 +26,7 @@ namespace TNL {

 #ifdef HAVE_CUDA

-template< int warpSize,
-          int WARPS,
-          int SHARED_PER_WARP,
-          int MAX_ELEM_PER_WARP,
-          typename BlocksView,
+template< typename BlocksView,
           typename Offsets,
           typename Index,
           typename Fetch,
@@ -50,21 +46,19 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-   //constexpr int WarpSize = Cuda::getWarpSize();
-   //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
-   //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
-
-
-   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
-   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
-   constexpr size_t MAX_X_DIM = 2147483647;
-   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
+   constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr int WarpSize = Cuda::getWarpSize();
+   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
+
+   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
-   if( threadIdx.x < CudaBlockSize / warpSize )
+   if( threadIdx.x < CudaBlockSize / WarpSize )
       multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
@@ -80,7 +74,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       const Index end = begin + block.getSize();

       // Stream data to shared memory
-      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
       {
          streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
             details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
@@ -90,7 +84,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,

       const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -105,7 +99,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          const Index end = begin + block.getSize();
          const Index segmentIdx = block.getFirstSegment();

-         for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+         for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize )
            result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

         // Parallel reduction
@@ -163,7 +157,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    // Reduction in multivectorShared
    if( block.getWarpIdx() == 0 && laneIdx < 16 )
    {
-      constexpr int totalWarps = CudaBlockSize / warpSize;
+      constexpr int totalWarps = CudaBlockSize / WarpSize;
       if( totalWarps >= 32 )
       {
          multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
@@ -199,14 +193,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 }
 #endif

-/*template< typename Index,
-          typename Device >
-CSRAdaptiveKernelView< Index, Device >::
-CSRAdaptiveKernelView( BlocksType& blocks )
-{
-   this->blocks.bind( blocks );
-}*/
-
 template< typename Index,
           typename Device >
 void
@@ -272,54 +258,28 @@ segmentsReduction( const OffsetsView& offsets,
       return;
    }

-   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
-
-   /* Max length of row to process one warp for CSR Light, MultiVector */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
-
-   /* Max length of row to process one warp for CSR Adaptive */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();
-
-   /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();
-
-   /* Number of elements in shared memory */
-   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
-
-   /* Number of warps in block for CSR Adaptive */
-   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
-
-   /* Number of elements in shared memory per one warp */
-   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
-
-   constexpr int warpSize = 32;
-
    Index blocksCount;

-   const Index threads = THREADS_ADAPTIVE;
-   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;

-   /* Fill blocks */
-   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
-   /* Execute kernels on device */
+   // Fill blocks
+   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
+   // Execute kernels on device
    for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
    {
-      if (MAX_X_DIM * threads >= neededThreads)
+      if( maxGridSize * threads >= neededThreads )
       {
-         blocksCount = roundUpDivision(neededThreads, threads);
+         blocksCount = roundUpDivision( neededThreads, threads );
          neededThreads = 0;
       }
       else
       {
-         blocksCount = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
+         blocksCount = maxGridSize;
+         neededThreads -= maxGridSize * threads;
       }

       segmentsReductionCSRAdaptiveKernel<
-         warpSize,
-         WARPS,
-         SHARED_PER_WARP,
-         details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
          BlocksView,
          OffsetsView,
          Index, Fetch, Reduction, ResultKeeper, Real, Args... >
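
Note on the pattern in this patch: both the launch loop in `segmentsReduction` and the kernel's index computation implement the same grid-splitting scheme. When the work needs more CUDA blocks than the grid's x-dimension allows, the kernel is launched repeatedly, and `gridIdx` tells each launch how many full grids preceded it. The patch replaces the hard-coded `2147483647` with `TNL::Cuda::getMaxGridSize()` and also corrects the inter-launch stride: the removed formula advanced the index by `gridIdx * MAX_X_DIM` threads, although each full launch actually covers `maxGridSize * blockDim.x` threads. The sketch below shows the scheme in isolation; it is a minimal standalone example, not TNL code, and `fillKernel`, `launchAll`, and the plain `maxGridSize` constant are hypothetical stand-ins for the TNL kernel, the launch loop, and `Cuda::getMaxGridSize()`.

```cuda
#include <cstddef>

// Assumed limit on gridDim.x for a 1D launch (2^31 - 1 on current
// devices); the patch reads it from TNL::Cuda::getMaxGridSize().
constexpr std::size_t maxGridSize = 2147483647;

// Hypothetical kernel: gridIdx says how many full grids ran before this
// launch, so the global index is rebuilt from a global block number.
__global__ void fillKernel( std::size_t gridIdx, double* data, std::size_t n )
{
   // Same reconstruction as in segmentsReductionCSRAdaptiveKernel:
   // offset by whole blocks first, then scale by the block size.
   const std::size_t i = ( gridIdx * maxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( i < n )
      data[ i ] = 1.0;
}

// Mirrors the "Execute kernels on device" loop: carve the work into
// successive grids until everything has been launched.
void launchAll( double* data, std::size_t n, unsigned threads )
{
   std::size_t neededThreads = n;
   for( std::size_t gridIdx = 0; neededThreads != 0; gridIdx++ ) {
      std::size_t blocksCount;
      if( maxGridSize * threads >= neededThreads ) {
         // Last launch: round the remaining work up to whole blocks
         // (the patch uses roundUpDivision for this).
         blocksCount = ( neededThreads + threads - 1 ) / threads;
         neededThreads = 0;
      }
      else {
         blocksCount = maxGridSize;
         neededThreads -= maxGridSize * threads;
      }
      fillKernel<<< static_cast< unsigned >( blocksCount ), threads >>>( gridIdx, data, n );
   }
}
```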