Attempted to fix non-working CUDA code for getting non-zero elements of (05234ee7) · Commits · TNL / tnl-dev

src/TNL/Matrices/CSR.h

+10 −1

Original line number	Diff line number	Diff line
		@@ -13,6 +13,9 @@
		#include <TNL/Matrices/Sparse.h>
		#include <TNL/Containers/Vector.h>

		#include <TNL/Devices/Cuda.h>
		#include <TNL/Exceptions/CudaBadAlloc.h>

		namespace TNL {
		namespace Matrices {

		@@ -80,8 +83,14 @@ public:
		// Returns the length of the given row. The definition computes it as
		// rowPointers[ row + 1 ] - rowPointers[ row ].
		// Marked __cuda_callable__.
		__cuda_callable__
		IndexType getRowLengthFast( const IndexType row ) const;

		// NOTE(review): both declarations below exist only when HAVE_CUDA is
		// defined, so they are unavailable in builds without CUDA - confirm that
		// this is intended.
		#ifdef HAVE_CUDA
		//__device__
		//void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );

		// Presumably returns the number of non-zero elements in the given row -
		// TODO confirm; the definition is not visible from this declaration.
		IndexType getNonZeroRowLength( const IndexType row ) const;

		// Returns getRow( row ).getNonZeroElementsCount().
		// NOTE(review): unlike getRowLengthFast, this declaration carries no
		// __cuda_callable__ qualifier - confirm whether it should.
		IndexType getNonZeroRowLengthFast( const IndexType row ) const;
		#endif
		// Accepts a CSR matrix instantiated with possibly different Real, Device
		// and Index types than this one.
		// Presumably adopts the dimensions/layout of `matrix` - TODO confirm
		// against the definition, which is not visible here.
		template< typename Real2, typename Device2, typename Index2 >
		void setLike( const CSR< Real2, Device2, Index2 >& matrix );

+5 −2

Original line number	Diff line number	Diff line
		@@ -131,9 +131,11 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const
		return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
		}

		// TODO: move to SparseRow
		#ifdef HAVE_CUDA
		// TODO: move to SparseRow
		template< typename MatrixRow >
		__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		__global__
		void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		{
		int threadId = blockIdx.x * blockDim.x + threadIdx.x;
		if( threadId == 0 )
		@@ -193,6 +195,7 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
		ConstMatrixRow matrixRow = this->getRow( row );
		return matrixRow.getNonZeroElementsCount();
		}
		#endif

		template< typename Real,
		typename Device,

+3 −0

Original line number	Diff line number	Diff line
		@@ -55,6 +55,9 @@ class SparseRow
		// Returns the `length` value held by this row object.
		// Marked __cuda_callable__.
		__cuda_callable__
		Index getLength() const;

		// __global__
		// void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );

		// Presumably counts the elements of this row whose value is non-zero -
		// TODO confirm against the definition, which is not visible here.
		// CSR::getNonZeroRowLengthFast returns this value for the row it obtains
		// from getRow( row ).
		__cuda_callable__
		Index getNonZeroElementsCount() const;

+10 −0

Original line number	Diff line number	Diff line
		@@ -112,6 +112,16 @@ getLength() const
		return length;
		}

		//template< typename MatrixRow >
		//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		//{
		// int threadId = blockIdx.x * blockDim.x + threadIdx.x;
		// if( threadId == 0 )
		// {
		// *result = row.getNonZeroElementsCount(); // write through the pointer; `row` is passed by value
		// }
		//}

		template< typename Real, typename Index >
		__cuda_callable__
		Index