diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 40700c50f25ed5c50539bf0d61dc7a6330212cfc..a9f921c73cea8c52b473837be21b6802ebc64f1a 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
-   //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
-   //__shared__ BlockType sharedBlocks[ WarpsCount ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   __shared__ BlockType sharedBlocks[ WarpsCount ];
 
    const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
 
-   //if( threadIdx.x < CudaBlockSize / WarpSize )
-   //   multivectorShared[ threadIdx.x ] = zero;
+   if( threadIdx.x < CudaBlockSize / WarpSize )
+      multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
    const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    __syncthreads();
    const auto& block = sharedBlocks[ warpIdx ];*/
    const BlockType block = blocks[ blockIdx ];
-   const Index begin = offsets[ block.getFirstSegment() ];
+   const Index firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
 
    if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
    {
@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 
       // Stream data to shared memory
       for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
-      {
          streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
-      }
-      //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); // exclusive upper bound of the segments handled by this block
 
-      /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
             result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
          keep( i, result );
-      }*/
+      }
    }
-   /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
             keep( segmentIdx, multivectorShared[ 0 ] );
          }
       }
-   }*/
+   }
 }
 #endif
 
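The block uncommented above is the STREAM branch of the adaptive kernel: the whole warp first stages the nonzeros of several short segments into shared memory, then each lane reduces one of those segments from the staged data. Below is a minimal standalone CUDA sketch of that pattern, for illustration only; it is not the TNL kernel, and all names in it (streamSketch, SHARED_PER_WARP, rowSums, ...) are hypothetical placeholders.

#include <cstdio>

constexpr int WARP_SIZE = 32;
constexpr int BLOCK_SIZE = 256;
constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
constexpr int SHARED_PER_WARP = 128;   // nonzeros staged per warp; must bound the block's nonzero count

// One warp handles one "stream" block of rowsPerBlock short consecutive rows.
__global__ void streamSketch( const int* offsets, const double* values, double* rowSums,
                              int firstRow, int rowsPerBlock )
{
   __shared__ double staged[ WARPS_PER_BLOCK ][ SHARED_PER_WARP ];

   const int laneIdx = threadIdx.x & ( WARP_SIZE - 1 );
   const int warpIdx = threadIdx.x / WARP_SIZE;

   const int firstSegment = firstRow + warpIdx * rowsPerBlock;
   const int lastSegment  = firstSegment + rowsPerBlock;
   const int begin = offsets[ firstSegment ];
   const int end   = offsets[ lastSegment ];

   // Phase 1: the whole warp streams the block's nonzeros into shared memory (coalesced reads).
   for( int globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WARP_SIZE )
      staged[ warpIdx ][ globalIdx - begin ] = values[ globalIdx ];
   __syncwarp();

   // Phase 2: each lane reduces one short segment from the staged data.
   for( int row = firstSegment + laneIdx; row < lastSegment; row += WARP_SIZE )
   {
      double sum = 0.0;
      for( int sharedIdx = offsets[ row ] - begin; sharedIdx < offsets[ row + 1 ] - begin; sharedIdx++ )
         sum += staged[ warpIdx ][ sharedIdx ];
      rowSums[ row ] = sum;
   }
}

int main()
{
   // Tiny CSR example: 8 rows with 4 unit nonzeros each, processed by a single warp.
   const int rows = 8, nnz = 32;
   int hOffsets[ rows + 1 ];
   double hValues[ nnz ];
   for( int r = 0; r <= rows; r++ )
      hOffsets[ r ] = 4 * r;
   for( int i = 0; i < nnz; i++ )
      hValues[ i ] = 1.0;

   int* dOffsets;
   double *dValues, *dRowSums;
   cudaMalloc( &dOffsets, sizeof( hOffsets ) );
   cudaMalloc( &dValues, sizeof( hValues ) );
   cudaMalloc( &dRowSums, rows * sizeof( double ) );
   cudaMemcpy( dOffsets, hOffsets, sizeof( hOffsets ), cudaMemcpyHostToDevice );
   cudaMemcpy( dValues, hValues, sizeof( hValues ), cudaMemcpyHostToDevice );

   streamSketch<<< 1, WARP_SIZE >>>( dOffsets, dValues, dRowSums, 0, rows );

   double hRowSums[ rows ];
   cudaMemcpy( hRowSums, dRowSums, sizeof( hRowSums ), cudaMemcpyDeviceToHost );
   for( int r = 0; r < rows; r++ )
      std::printf( "row %d: %g\n", r, hRowSums[ r ] );   // each row sums to 4

   cudaFree( dOffsets );
   cudaFree( dValues );
   cudaFree( dRowSums );
   return 0;
}

Staging through shared memory lets the warp read the nonzeros of many very short rows with coalesced accesses before reducing them one row per lane, which is the point of the STREAM strategy.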
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 96f1899b268596bc57ba395cec1556ab5fbdfff5..d2be8966453c9d1253720925cfea44545bfbbb96 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -22,11 +22,13 @@ enum class Type {
    VECTOR = 2
 };
 
+//#define CSR_ADAPTIVE_UNION   // define to select the union-based block descriptor below
+
 #ifdef CSR_ADAPTIVE_UNION
 template< typename Index >
 union CSRAdaptiveKernelBlockDescriptor
 {
-   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
    {
       this->index[0] = row;
       this->index[1] = index;
@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
       return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
    }
 
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return index[ 1 ];
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return 1;   // the union-based descriptor never splits a block across several warps
+   }
+
    void print( std::ostream& str ) const
    {
       Type type = this->getType();
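The new getWarpIdx()/getWarpsCount() accessors presumably exist so that one long segment can be split across several cooperating warps (the multivector case hinted at in the kernel above); with this union-based descriptor a block is never split, hence getWarpsCount() returning 1. Below is a small standalone C++ sketch of the slicing arithmetic such accessors enable; WarpSlice and sliceForWarp are hypothetical names, not TNL API.

#include <algorithm>
#include <cstdio>

struct WarpSlice { int begin, end; };

// Warp `warpIdx` out of `warpsCount` cooperating warps takes one contiguous chunk
// of a long segment covering the nonzero range [begin, end).
WarpSlice sliceForWarp( int begin, int end, int warpIdx, int warpsCount )
{
   const int elementsPerWarp = ( end - begin + warpsCount - 1 ) / warpsCount;   // ceiling division
   const int sliceBegin = std::min( begin + warpIdx * elementsPerWarp, end );
   const int sliceEnd   = std::min( sliceBegin + elementsPerWarp, end );
   return { sliceBegin, sliceEnd };
}

int main()
{
   // A segment with 100 nonzeros starting at offset 40, split across 3 warps.
   for( int warp = 0; warp < 3; warp++ ) {
      const WarpSlice s = sliceForWarp( 40, 140, warp, 3 );
      std::printf( "warp %d covers [%d, %d)\n", warp, s.begin, s.end );
   }
   return 0;
}

For 100 nonzeros split over 3 warps this prints the ranges [40, 74), [74, 108) and [108, 140).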
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
index 3fa0855cb66fca91440896bd0ab4c38046044948..0f00fbd808772b14bb11e85951566730c766e310 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,7 +15,7 @@ namespace TNL {
       namespace Segments {
          namespace details {
 
-static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };
 
 template< int SizeOfValue = 1,
           int StreamedSharedMemory_ = 24576 >
@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters
 
    static constexpr int getSizeValueLogConstexpr( const int i );
 
+   static constexpr int getSizeOfValue() { return SizeOfValue; }
+
    static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
+
    static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
 
    /**