diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index ea008fdc7c6efc3ae75b76142b4f318dce823791..113008ad080edc6611e36a8d6ed9b0391ea49110 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -32,8 +32,6 @@ struct CSRAdaptiveKernelView

    CSRAdaptiveKernelView() = default;

-   CSRAdaptiveKernelView( BlocksType& blocks );
-
    void setBlocks( BlocksType& blocks, const int idx );

    ViewType getView();
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index d4a369f252ffd61be1ade8589a3070bccfc032f7..427b5eba7b2d3ab11eb9a09617eff873a03c4cea 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -26,11 +26,7 @@ namespace TNL {

 #ifdef HAVE_CUDA

-template< int warpSize,
-          int WARPS,
-          int SHARED_PER_WARP,
-          int MAX_ELEM_PER_WARP,
-          typename BlocksView,
+template< typename BlocksView,
           typename Offsets,
           typename Index,
           typename Fetch,
@@ -50,21 +46,19 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-   //constexpr int WarpSize = Cuda::getWarpSize();
-   //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
-   //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
-
-
-   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
-   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
-   constexpr size_t MAX_X_DIM = 2147483647;
-   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
+   constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr int WarpSize = Cuda::getWarpSize();
+   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
+
+   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
-   if( threadIdx.x < CudaBlockSize / warpSize )
+   if( threadIdx.x < CudaBlockSize / WarpSize )
       multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
@@ -80,7 +74,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       const Index end = begin + block.getSize();

       // Stream data to shared memory
-      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
       {
          streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
             details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
@@ -90,7 +84,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,

       const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -105,7 +99,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          const Index end = begin + block.getSize();
          const Index segmentIdx = block.getFirstSegment();

-         for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+         for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize )
            result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

         // Parallel reduction
@@ -163,7 +157,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    // Reduction in multivectorShared
    if( block.getWarpIdx() == 0 && laneIdx < 16 )
    {
-      constexpr int totalWarps = CudaBlockSize / warpSize;
+      constexpr int totalWarps = CudaBlockSize / WarpSize;
       if( totalWarps >= 32 )
       {
          multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
@@ -199,14 +193,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 }
 #endif

-/*template< typename Index,
-          typename Device >
-CSRAdaptiveKernelView< Index, Device >::
-CSRAdaptiveKernelView( BlocksType& blocks )
-{
-   this->blocks.bind( blocks );
-}*/
-
 template< typename Index,
           typename Device >
 void
@@ -272,54 +258,28 @@ segmentsReduction( const OffsetsView& offsets,
       return;
    }

-   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
-
-   /* Max length of row to process one warp for CSR Light, MultiVector */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
-
-   /* Max length of row to process one warp for CSR Adaptive */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();
-
-   /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();
-
-   /* Number of elements in shared memory */
-   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
-
-   /* Number of warps in block for CSR Adaptive */
-   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
-
-   /* Number of elements in shared memory per one warp */
-   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
-
-   constexpr int warpSize = 32;
-
    Index blocksCount;

-   const Index threads = THREADS_ADAPTIVE;
-   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;

-   /* Fill blocks */
-   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
-   /* Execute kernels on device */
+   // Fill blocks
+   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
+   // Execute kernels on device
    for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
    {
-      if (MAX_X_DIM * threads >= neededThreads)
+      if( maxGridSize * threads >= neededThreads )
       {
-         blocksCount = roundUpDivision(neededThreads, threads);
+         blocksCount = roundUpDivision( neededThreads, threads );
          neededThreads = 0;
       }
       else
       {
-         blocksCount = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
+         blocksCount = maxGridSize;
+         neededThreads -= maxGridSize * threads;
       }

       segmentsReductionCSRAdaptiveKernel<
-         warpSize,
-         WARPS,
-         SHARED_PER_WARP,
-         details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
          BlocksView,
          OffsetsView,
          Index, Fetch, Reduction, ResultKeeper, Real, Args... >
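
Note on the pattern in this patch: both the launch loop in `segmentsReduction` and the kernel's index computation implement the same grid-splitting scheme. When the work needs more CUDA blocks than the grid's x-dimension allows, the kernel is launched repeatedly, and `gridIdx` tells each launch how many full grids preceded it. The patch replaces the hard-coded `2147483647` with `TNL::Cuda::getMaxGridSize()` and also corrects the inter-launch stride: the removed formula advanced the index by `gridIdx * MAX_X_DIM` threads, although each full launch actually covers `maxGridSize * blockDim.x` threads. The sketch below shows the scheme in isolation; it is a minimal standalone example, not TNL code, and `fillKernel`, `launchAll`, and the plain `maxGridSize` constant are hypothetical stand-ins for the TNL kernel, the launch loop, and `Cuda::getMaxGridSize()`.

```cuda
#include <cstddef>

// Assumed limit on gridDim.x for a 1D launch (2^31 - 1 on current
// devices); the patch reads it from TNL::Cuda::getMaxGridSize().
constexpr std::size_t maxGridSize = 2147483647;

// Hypothetical kernel: gridIdx says how many full grids ran before this
// launch, so the global index is rebuilt from a global block number.
__global__ void fillKernel( std::size_t gridIdx, double* data, std::size_t n )
{
   // Same reconstruction as in segmentsReductionCSRAdaptiveKernel:
   // offset by whole blocks first, then scale by the block size.
   const std::size_t i = ( gridIdx * maxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( i < n )
      data[ i ] = 1.0;
}

// Mirrors the "Execute kernels on device" loop: carve the work into
// successive grids until everything has been launched.
void launchAll( double* data, std::size_t n, unsigned threads )
{
   std::size_t neededThreads = n;
   for( std::size_t gridIdx = 0; neededThreads != 0; gridIdx++ ) {
      std::size_t blocksCount;
      if( maxGridSize * threads >= neededThreads ) {
         // Last launch: round the remaining work up to whole blocks
         // (the patch uses roundUpDivision for this).
         blocksCount = ( neededThreads + threads - 1 ) / threads;
         neededThreads = 0;
      }
      else {
         blocksCount = maxGridSize;
         neededThreads -= maxGridSize * threads;
      }
      fillKernel<<< static_cast< unsigned >( blocksCount ), threads >>>( gridIdx, data, n );
   }
}
```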