diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 2014ae3dc983dbf025a51a971deaa9dca90d1ef4..c45844fa23ef90b3957d4039e300a4793124adc1 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
          detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
-         cudaThreadSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 6133a843844b089bc60a16da7181cc8149c14c2a..6f9ee48fe0fcf140ae341b28bb1661efd8c5c543 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 }
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index b5311d7939e1d49d826b6ba76bffb11af23aff0f..ca418e691f8066424ef0e47dc5227eeeb15962e9 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher
    exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
    {
    #ifdef HAVE_CUDA
+      if( last <= first )
+         return;
       const Index segmentsCount = last - first;
       const Index threadsCount = segmentsCount * 32;
       const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
       dim3 blockSize( 256 );
       dim3 gridSize( blocksCount );
       EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
-      cudaDeviceSynchronize();
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
    #endif
    }
 };
@@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
    exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
    {
    #ifdef HAVE_CUDA
+      if( last <= first )
+         return;
       const Index segmentsCount = last - first;
       const Index threadsCount = segmentsCount * 32;
       const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
       dim3 blockSize( 256 );
       dim3 gridSize( blocksCount );
       EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
-      cudaDeviceSynchronize();
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
    #endif
    }
 };
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
index 9c495fd70ad15b17db8219c69cc0638b0c42d7fe..86307e7540cddc5e0bf5acb55eac5e6ab08de722 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
                zero,
                args... );
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 };
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 68198f995f4a8daecd67b8e05881432aacc4372d..7feb50feff212944ce6606451ae8db503206e810 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -175,7 +175,10 @@ void
 CSRHybridKernel< Index, Device, ThreadsInBlock >::
 init( const Offsets& offsets )
 {
+    TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
     const Index segmentsCount = offsets.getSize() - 1;
+    if( segmentsCount <= 0 )
+       return;
     const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
     this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() );
     TNL_ASSERT_GE( threadsPerSegment, 0, "" );
@@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets,
     TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );
 
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
     const size_t threadsCount = this->threadsPerSegment * ( last - first );
     dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
     TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
@@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets,
                 throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
         }
     }
+    cudaStreamSynchronize(0);
+    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
 
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 93d3e2800e9d6b6b9277306458ea50c0c897a4e0..0ceecc371642a82e23daa248be6a152c880198e3 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -424,6 +424,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                        const Index threadsPerSegment )
    {
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
+
       const size_t threads = 128;
       Index blocks, groupSize;
 
@@ -500,8 +503,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                      grid, offsets, first, last, fetch, reduce, keep, zero );
          }*/
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
-
    }
 };
 
@@ -513,21 +517,24 @@ void
 CSRLightKernel< Index, Device >::
 init( const Offsets& offsets )
 {
+   TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
    const Index segmentsCount = offsets.getSize() - 1;
+    if( segmentsCount <= 0 )
+       return;
 
    if( this->getThreadsMapping() == CSRLightAutomaticThreads )
    {
       const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
       if( elementsInSegment <= 2 )
-         this->threadsPerSegment = 2;
+         setThreadsPerSegment( 2 );
       else if( elementsInSegment <= 4 )
-         this->threadsPerSegment = 4;
+         setThreadsPerSegment( 4 );
       else if( elementsInSegment <= 8 )
-         this->threadsPerSegment = 8;
+         setThreadsPerSegment( 8 );
       else if( elementsInSegment <= 16 )
-         this->threadsPerSegment = 16;
+         setThreadsPerSegment( 16 );
       else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-         this->threadsPerSegment = 32; // CSR Vector
+         setThreadsPerSegment( 32 ); // CSR Vector
       //else
       //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
    }
@@ -536,22 +543,18 @@ init( const Offsets& offsets )
    {
       const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
       if( elementsInSegment <= 2 )
-         this->threadsPerSegment = 2;
+         setThreadsPerSegment( 2 );
       else if( elementsInSegment <= 4 )
-         this->threadsPerSegment = 4;
+         setThreadsPerSegment( 4 );
       else if( elementsInSegment <= 8 )
-         this->threadsPerSegment = 8;
+         setThreadsPerSegment( 8 );
       else if( elementsInSegment <= 16 )
-         this->threadsPerSegment = 16;
+         setThreadsPerSegment( 16 );
       else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-         this->threadsPerSegment = 32; // CSR Vector
+         setThreadsPerSegment( 32 ); // CSR Vector
       //else
       //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
    }
-
-   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
-   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
-
 }
 
 template< typename Index,
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
index cf7d80af65b8bbefcbba0490b5658ae905fc9e6d..a070761f752e0e53372b3ef350743b42d0b4b59a 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
@@ -140,6 +140,9 @@ reduceSegments( const OffsetsView& offsets,
                          Args... args )
 {
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
+
     const Index warpsCount = last - first;
     const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
     dim3 blocksCount, gridsCount, blockSize( 256 );
@@ -152,9 +155,12 @@ reduceSegments( const OffsetsView& offsets,
         reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
         <<< gridSize, blockSize >>>(
             gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
-    };
+    }
+    cudaStreamSynchronize(0);
+    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 1716b0ab894d506fe8eed2f13eb29821cb2266cf..444da8666fbbb2d4a6615d59f8266f9d855fd157 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -243,6 +243,9 @@ void test_SetDimensions()
 
    EXPECT_EQ( m.getRows(), 9 );
    EXPECT_EQ( m.getColumns(), 8 );
+
+   // test empty matrix
+   m.setDimensions( 0, 0 );
 }
 
 template< typename Matrix >