Changed implementation of method. Committing for backup purposes. (7f24c2ed) · Commits · TNL / tnl-dev

src/TNL/Matrices/CSR_impl.h

+4 −12

Original line number	Diff line number	Diff line
		@@ -137,8 +137,7 @@ template< typename Real,
		Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
		{
		ConstMatrixRow matrixRow = this->getRow( row );
		IndexType count = matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) );
		// return count;
		return matrixRow.getNonZeroElementsCount( TNL::String( Device::getDeviceType() ) );
		// getRow() was throwing segmentation faults.
		// FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ).

		@@ -459,19 +458,12 @@ typename CSR< Real, Device, Index >::ConstMatrixRow
		CSR< Real, Device, Index >::
		getRow( const IndexType rowIndex ) const
		{
		const IndexType rowOffset = this->rowPointers.getElement( rowIndex );
		const IndexType rowLength = this->rowPointers.getElement( rowIndex + 1 ) - rowOffset;
		const IndexType rowOffset = this->rowPointers[ rowIndex ];
		const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset;
		return ConstMatrixRow( &this->columnIndexes[ rowOffset ],
		&this->values[ rowOffset ],
		rowLength,
		1 );

		// const IndexType rowOffset = this->rowPointers[ rowIndex ];
		// const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset;
		// return ConstMatrixRow( &this->columnIndexes[ rowOffset ],
		// &this->values[ rowOffset ],
		// rowLength,
		// 1 );
		}

		template< typename Real,

src/TNL/Matrices/SparseRow_impl.h

+36 −33

Original line number	Diff line number	Diff line
		@@ -118,9 +118,6 @@ Index
		SparseRow< Real, Index >::
		getNonZeroElementsCount( TNL::String deviceType ) const
		{
		using CudaType = typename TNL::Devices::Cuda;
		using HostType = typename TNL::Devices::Host;

		using NonConstIndex = typename std::remove_const< Index >::type;

		// If this is static, it will trigger an illegal memory address
		@@ -128,41 +125,47 @@ getNonZeroElementsCount( TNL::String deviceType ) const
		NonConstIndex elementCount ( 0 );


		// elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.

		// PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
		// PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
		auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i ) mutable
		{
		//std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
		if( this->values[ i * step ] != 0.0 )
		elementCount++;

		//std::cout << "End of lambda elementCount = " << elementCount << "/n";
		};
		// using CudaType = typename TNL::Devices::Cuda;
		// using HostType = typename TNL::Devices::Host;
		//
		//
		// // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
		//
		// // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
		// // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
		// // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs
		// auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex* elementCount*/ ) mutable
		// {
		// //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
		// if( this->values[ i * step ] != 0.0 )
		// elementCount++;//*elementCount++;
		//
		// //std::cout << "End of lambda elementCount = " << elementCount << "/n";
		// //return elementCount;
		// };
		//
		//
		// // Decide which ParallelFor will be executed, either Host or Cuda.
		// if( deviceType == TNL::String( "Devices::Host" ) )
		// {
		// ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
		// }
		//
		// else if( deviceType == TNL::String( "Cuda" ) )
		// {
		// ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
		// }


		// Decide which ParallelFor will be executed, either Host or Cuda.
		if( deviceType == TNL::String( "Devices::Host" ) )
		{
		ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros );
		}

		else if( deviceType == TNL::String( "Cuda" ) )
		// // THE FOLLOWING doesn't work on GPU
		for( NonConstIndex i = 0; i < length; i++ )
		{
		ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros );
		std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
		if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
		elementCount++;
		}


		// // THE FOLLOWING doesn't work on GPU
		// for( NonConstIndex i = 0; i < length; i++ )
		// {
		// std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
		// if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
		// elementCount++;
		// }

		// std::cout << "Element Count = " << elementCount << "\n";
		std::cout << "Element Count = " << elementCount << "\n";

		return elementCount;
		}