diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 2ddfcd65c8269431e43cfd6117749fa3dcbe4a1b..35424d93c22bc18b7525c13c1acf8c574c380aae 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -46,6 +46,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
+   using BlockType = details::CSRAdaptiveKernelBlockDescriptor< Index >;
    constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
    constexpr int WarpSize = Cuda::getWarpSize();
    constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
@@ -53,6 +54,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
    __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   __shared__ BlockType sharedBlocks[ WarpsCount ];
+
    const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
@@ -63,14 +66,19 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    Real result = zero;
    bool compute( true );
    const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
-   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
+   const Index warpIdx = threadIdx.x / 32;
+   /*if( laneIdx == 0 )
+      sharedBlocks[ warpIdx ] = blocks[ blockIdx ];
+   __syncthreads();
+   const auto& block = sharedBlocks[ warpIdx ];*/
+   const BlockType block = blocks[ blockIdx ];
    const Index& firstSegmentIdx = block.getFirstSegment();
    const Index begin = offsets[ firstSegmentIdx ];
 
    const auto blockType = block.getType();
    if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
    {
-      const Index warpIdx = threadIdx.x / 32;
+
       const Index end = begin + block.getSize();
 
       // Stream data to shared memory