diff --git a/src/TNL/Matrices/CSR.h b/src/TNL/Matrices/CSR.h
index 423b40feff03c60869486c76bd330dd129b13df3..348f01592cf13a7061bc23b70d62e0a9405bfb07 100644
--- a/src/TNL/Matrices/CSR.h
+++ b/src/TNL/Matrices/CSR.h
@@ -13,6 +13,9 @@
 #include <TNL/Matrices/Sparse.h>
 #include <TNL/Containers/Vector.h>
 
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Exceptions/CudaBadAlloc.h>
+
 namespace TNL {
 namespace Matrices {
    
@@ -80,8 +83,14 @@ public:
    __cuda_callable__
    IndexType getRowLengthFast( const IndexType row ) const;
    
+#ifdef HAVE_CUDA
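+   // NOTE: the helper kernel is a free __global__ function template in CSR_impl.h (a kernel cannot be a class member):
+   //void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );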
+   
    IndexType getNonZeroRowLength( const IndexType row ) const;
-
+   
+   IndexType getNonZeroRowLengthFast( const IndexType row ) const;
+#endif
    template< typename Real2, typename Device2, typename Index2 >
    void setLike( const CSR< Real2, Device2, Index2 >& matrix );
 
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index a77e68575b13ce5933dd7fbfd10a376402af48e5..e8324de778f67ca6ceb2786c7643d1a8cf6d4687 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -131,9 +131,11 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const
    return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
 }
 
-// TODO: presunout do SparseRow
+#ifdef HAVE_CUDA
+// TODO: move to SparseRow
 template< typename MatrixRow >
-__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
+__global__
+void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
 {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    if( threadId == 0 )
@@ -193,6 +195,7 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
    ConstMatrixRow matrixRow = this->getRow( row );
    return matrixRow.getNonZeroElementsCount();
 }
+#endif
 
 template< typename Real,
           typename Device,
diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h
index fac855eae71a26cdb4dbf62f927f12d0f24b5af1..6407d4a526303c70124dd03be48d72e9eab0e565 100644
--- a/src/TNL/Matrices/SparseRow.h
+++ b/src/TNL/Matrices/SparseRow.h
@@ -55,6 +55,9 @@ class SparseRow
       __cuda_callable__
       Index getLength() const;
       
+//      __global__ 
+//      void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );
+      
       __cuda_callable__
       Index getNonZeroElementsCount() const;
 
diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h
index d83aad239271c44baffb1bc5181aa65224725bf4..31d133c618a94a9804c79ff45849e46e9c95c77c 100644
--- a/src/TNL/Matrices/SparseRow_impl.h
+++ b/src/TNL/Matrices/SparseRow_impl.h
@@ -112,6 +112,16 @@ getLength() const
    return length;
 }
 
+//template< typename MatrixRow >
+//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
+//{
+//   int threadId = blockIdx.x * blockDim.x + threadIdx.x;
+//   if( threadId == 0 )
+//   {
+//      *result = row.getNonZeroElementsCount();
+//   }
+//}
+
 template< typename Real, typename Index >
 __cuda_callable__
 Index