diff --git a/src/TNL/Matrices/CSR.h b/src/TNL/Matrices/CSR.h
index 348f01592cf13a7061bc23b70d62e0a9405bfb07..c8f87553ab7bec948186ce4fd6f7ba5c1b9ef9d8 100644
--- a/src/TNL/Matrices/CSR.h
+++ b/src/TNL/Matrices/CSR.h
@@ -45,17 +45,17 @@ private:
 public:
 
    using RealType = Real;
-   //typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
+   using DeviceType = Device;
+   using IndexType = Index;
    typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef CSR< Real, Device, Index > ThisType;
    typedef CSR< Real, Devices::Host, Index > HostType;
    typedef CSR< Real, Devices::Cuda, Index > CudaType;
    typedef Sparse< Real, Device, Index > BaseType;
-   typedef typename BaseType::MatrixRow MatrixRow;
+   //typedef typename BaseType::MatrixRow MatrixRow;
    
+   using MatrixRow = typename BaseType::MatrixRow;
    using ConstMatrixRow = typename BaseType::ConstMatrixRow;
    //using typename BaseType::ConstMatrixRow;
    //typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index e8324de778f67ca6ceb2786c7643d1a8cf6d4687..ad24fc699b1030a8b60892ac1941bcd4e19d04b0 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -133,16 +133,17 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const
 
 #ifdef HAVE_CUDA
 // TODO: move to SparseRow
-template< typename MatrixRow >
+template< typename MatrixRow, typename Index >
 __global__
-void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
+void getNonZeroRowLengthCudaKernel( const MatrixRow row, Index* result )
 {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    if( threadId == 0 )
    {
-      result = row->getNonZeroElementsCount();
+      *result = row.getNonZeroElementsCount();
    }
 }
+#endif
 
 template< typename Real,
           typename Device,
@@ -156,34 +157,31 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
    }
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
-      ConstMatrixRow matrixRow = this->getRow( row );
-      IndexType resultHost;
+      IndexType *cols = new IndexType[4];
+      std::cout << "crash1" << std::endl;
+      RealType *vals = new RealType[4];
+      std::cout << "crash2" << std::endl;
+      for( int i = 0; i < 4; i++ )
+      {
+          cols[i] = i;
+          vals[i] = 1.0;
+      }
+      std::cout << "crash3" << std::endl;
+      ConstMatrixRow matrixRow(cols, vals, 4, 1); // = this->getRow( row ); // If the program even compiles, this line fails because a segfault occurs on the first line of getRow()
+      std::cout << "crash4" << std::endl;
+      IndexType resultHost ( 0 );
       IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost );
-      getNonZeroRowLengthCudaKernel<<< 1, 1 >>>( row, &resultCuda );
-      resultHost = Devices::Cuda::passFromDevice( resultCuda );
+      std::cout << "resultCuda = " << resultCuda << std::endl;
+      // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following compile error is emitted:
+      //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
+      /*TNL::Matrices::*/getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
+      std::cout << "resultCuda = " << resultCuda << std::endl;
+      std::cout << "crash5" << std::endl;
+      resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This crashes with CUDA error: an illegal memory access was encountered.
+      std::cout << "crash6" << std::endl;
       Devices::Cuda::freeFromDevice( resultCuda );
       return resultHost;
    }
-   
-    // getRow() was throwing segmentation faults.
-    // FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ).
-    
-    
-    // THE FOLLOWING throws: /home/lukas/tnl-dev/src/TNL/ParallelFor.h(92): error: identifier "" is undefined in device code
-//    static IndexType elementCount ( 0 );
-//    ConstMatrixRow matrixRow = this->getRow( row );
-//    
-//    elementCount = 0; // Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
-//    
-//    auto computeNonZeros = [matrixRow] __cuda_callable__ ( IndexType i ) mutable
-//    {
-//        if( matrixRow.getElementValue( i ) != 0.0 )
-//            elementCount++;
-//    };
-//    
-//    ParallelFor< DeviceType >::exec( (IndexType) 0, matrixRow.getLength(), computeNonZeros );
-//    
-//    return elementCount;
 }
 
 template< typename Real,
@@ -195,7 +193,6 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
    ConstMatrixRow matrixRow = this->getRow( row );
    return matrixRow.getNonZeroElementsCount();
 }
-#endif
 
 template< typename Real,
           typename Device,
diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h
index 6407d4a526303c70124dd03be48d72e9eab0e565..fac855eae71a26cdb4dbf62f927f12d0f24b5af1 100644
--- a/src/TNL/Matrices/SparseRow.h
+++ b/src/TNL/Matrices/SparseRow.h
@@ -55,9 +55,6 @@ class SparseRow
       __cuda_callable__
       Index getLength() const;
       
-//      __global__ 
-//      void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );
-      
       __cuda_callable__
       Index getNonZeroElementsCount() const;
 
diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h
index 31d133c618a94a9804c79ff45849e46e9c95c77c..3157b6c963515434d1cb1ed478b6125236e2600a 100644
--- a/src/TNL/Matrices/SparseRow_impl.h
+++ b/src/TNL/Matrices/SparseRow_impl.h
@@ -113,7 +113,8 @@ getLength() const
 }
 
 //template< typename MatrixRow >
-//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
+//__global__ 
+//void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
 //{
 //   int threadId = blockIdx.x * blockDim.x + threadIdx.x;
 //   if( threadId == 0 )
@@ -130,52 +131,16 @@ getNonZeroElementsCount() const
 {
     using NonConstIndex = typename std::remove_const< Index >::type;
     
-    // If this is static, it will trigger a illegal memory address
-    // How to get it into the lambda function?
     NonConstIndex elementCount ( 0 );
-    
-    
-//    using CudaType = typename TNL::Devices::Cuda;
-//    using HostType = typename TNL::Devices::Host;
-//    
-//    
-//    // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
-//    
-//    // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
-//    // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
-//        // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs
-//    auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex *elementCount*/ ) mutable
-//    {
-//        //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
-//        if( this->values[ i * step ] != 0.0 )
-//            elementCount++;//*elementCount++;
-//        
-//        //std::cout << "End of lambda elementCount = " << elementCount << "/n";
-//        //return elementCount;
-//    };
-//    
-//    
-//    // Decide which ParallelFor will be executed, either Host or Cuda.
-//    if( deviceType == TNL::String( "Devices::Host" ) )
-//    {
-//        ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
-//    }
-//    
-//    else if( deviceType == TNL::String( "Cuda" ) )
-//    {
-//        ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
-//    }
    
-    
-//    // THE FOLLOWING doesn't work on GPU
     for( NonConstIndex i = 0; i < length; i++ )
     {
-        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
+//        std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
         if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
             elementCount++;
     }
     
-     std::cout << "Element Count = " << elementCount << "\n";
+//     std::cout << "Element Count = " << elementCount << "\n";
     
     return elementCount;
 }