Deleted useless code, reformatted present code. Still have issues with... (bd294176) · Commits · TNL / tnl-dev

src/TNL/Matrices/CSR.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -45,17 +45,17 @@ private:
		public:

		using RealType = Real;
		//typedef Real RealType;
		typedef Device DeviceType;
		typedef Index IndexType;
		using DeviceType = Device;
		using IndexType = Index;
		typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
		typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
		typedef CSR< Real, Device, Index > ThisType;
		typedef CSR< Real, Devices::Host, Index > HostType;
		typedef CSR< Real, Devices::Cuda, Index > CudaType;
		typedef Sparse< Real, Device, Index > BaseType;
		typedef typename BaseType::MatrixRow MatrixRow;
		//typedef typename BaseType::MatrixRow MatrixRow;

		using MatrixRow = typename BaseType::MatrixRow;
		using ConstMatrixRow = typename BaseType::ConstMatrixRow;
		//using typename BaseType::ConstMatrixRow;
		//typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;

src/TNL/Matrices/CSR_impl.h

+25 −28

Original line number	Diff line number	Diff line
		@@ -133,16 +133,17 @@ Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const

		#ifdef HAVE_CUDA
		// TODO: move to SparseRow
		// CUDA kernel that obtains the number of non-zero elements of `row` via
		// row.getNonZeroElementsCount() and hands it back through `result`.
		// Only the thread whose global index is 0 touches `result`.
		// NOTE(review): this listing is a rendered diff, so the removed and the added
		// version of each changed line appear back to back - presumably the first of
		// each pair is the removed one; confirm against the repository.
		template< typename MatrixRow >
		template< typename MatrixRow, typename Index >
		__global__
		void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		void getNonZeroRowLengthCudaKernel( const MatrixRow row, Index* result )
		{
		// Global thread index computed from the block and thread coordinates.
		int threadId = blockIdx.x * blockDim.x + threadIdx.x;
		// Guard so that a single thread performs the write.
		if( threadId == 0 )
		{
		// This variant assigns to the pointer parameter itself and uses `->` on `row`,
		// which is received by value.
		result = row->getNonZeroElementsCount();
		// This variant stores the count through the pointer and uses `.` on `row`.
		*result = row.getNonZeroElementsCount();
		}
		}
		#endif

		template< typename Real,
		typename Device,
		@@ -156,34 +157,31 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
		}
		if( std::is_same< DeviceType, Devices::Cuda >::value )
		{
		ConstMatrixRow matrixRow = this->getRow( row );
		IndexType resultHost;
		IndexType *cols = new IndexType[4];
		std::cout << "crash1" << std::endl;
		RealType *vals = new RealType[4];
		std::cout << "crash2" << std::endl;
		for( int i = 0; i < 4; i++ )
		{
		cols[i] = i;
		vals[i] = 1.0;
		}
		std::cout << "crash3" << std::endl;
		ConstMatrixRow matrixRow(cols, vals, 4, 1); // = this->getRow( row ); // If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
		std::cout << "crash4" << std::endl;
		IndexType resultHost ( 0 );
		IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost );
		getNonZeroRowLengthCudaKernel<<< 1, 1 >>>( row, &resultCuda );
		resultHost = Devices::Cuda::passFromDevice( resultCuda );
		std::cout << "resultCuda = " << resultCuda << std::endl;
		// PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
		// 'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
		/TNL::Matrices::/getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
		std::cout << "resultCuda = " << resultCuda << std::endl;
		std::cout << "crash5" << std::endl;
		resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address.
		std::cout << "crash6" << std::endl;
		Devices::Cuda::freeFromDevice( resultCuda );
		return resultHost;
		}

		// getRow() was throwing segmentation faults.
		// FOR THIS TO WORK, I had to change getRow() from [ rowIndex ] to .getElement( rowIndex ).


		// THE FOLLOWING throws: /home/lukas/tnl-dev/src/TNL/ParallelFor.h(92): error: identifier "" is undefined in device code
		// static IndexType elementCount ( 0 );
		// ConstMatrixRow matrixRow = this->getRow( row );
		//
		// elementCount = 0; // Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
		//
		// auto computeNonZeros = [matrixRow] __cuda_callable__ ( IndexType i ) mutable
		// {
		// if( matrixRow.getElementValue( i ) != 0.0 )
		// elementCount++;
		// };
		//
		// ParallelFor< DeviceType >::exec( (IndexType) 0, matrixRow.getLength(), computeNonZeros );
		//
		// return elementCount;
		}

		template< typename Real,
		@@ -195,7 +193,6 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row )
		ConstMatrixRow matrixRow = this->getRow( row );
		return matrixRow.getNonZeroElementsCount();
		}
		#endif

		template< typename Real,
		typename Device,

src/TNL/Matrices/SparseRow.h

+0 −3

Original line number	Diff line number	Diff line
		@@ -55,9 +55,6 @@ class SparseRow
		__cuda_callable__
		Index getLength() const;

		// __global__
		// void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result );

		__cuda_callable__
		Index getNonZeroElementsCount() const;

src/TNL/Matrices/SparseRow_impl.h

+4 −39

Original line number	Diff line number	Diff line
		@@ -113,7 +113,8 @@ getLength() const
		}

		//template< typename MatrixRow >
		//__global__ void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		//__global__
		//void getNonZeroRowLengthCudaKernel( const MatrixRow row, typename MatrixRow::IndexType* result )
		//{
		// int threadId = blockIdx.x * blockDim.x + threadIdx.x;
		// if( threadId == 0 )
		@@ -130,52 +131,16 @@ getNonZeroElementsCount() const
		{
		using NonConstIndex = typename std::remove_const< Index >::type;

		// If this is static, it will trigger an illegal memory address
		// How to get it into the lambda function?
		NonConstIndex elementCount ( 0 );


		// using CudaType = typename TNL::Devices::Cuda;
		// using HostType = typename TNL::Devices::Host;
		//
		//
		// // elementCount = 0; // Only if it is static. Make sure it is reset. Without this seemingly useless step, it returned incorrect values.
		//
		// // PROBLEM: Lambda function with __cuda_callable__ CANNOT pass values by reference!!
		// // INCORRECT ASSUMPTION!! PROBLEM: Lambda function which takes in anything via capture list, cannot return anything. (Maybe dont capture anything? pass this->values by parameter and return count?)
		// // WRONG: https://stackoverflow.com/questions/38835154/lambda-function-capture-a-variable-vs-return-value?fbclid=IwAR0ybDD83LRWxkJsrcoSmGW2mbsMfhywmdZQkleqyjU-NOIwqkz8woihfXs
		// auto computeNonZeros = [=] __cuda_callable__ ( NonConstIndex i /*, NonConstIndex* elementCount*/ ) mutable
		// {
		// //std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0/n";
		// if( this->values[ i * step ] != 0.0 )
		// elementCount++;//*elementCount++;
		//
		// //std::cout << "End of lambda elementCount = " << elementCount << "/n";
		// //return elementCount;
		// };
		//
		//
		// // Decide which ParallelFor will be executed, either Host or Cuda.
		// if( deviceType == TNL::String( "Devices::Host" ) )
		// {
		// ParallelFor< HostType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
		// }
		//
		// else if( deviceType == TNL::String( "Cuda" ) )
		// {
		// ParallelFor< CudaType >::exec( ( NonConstIndex ) 0, length, computeNonZeros /*, &elementCount*/ );
		// }


		// // THE FOLLOWING doesn't work on GPU
		for( NonConstIndex i = 0; i < length; i++ )
		{
		std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
		// std::cout << "this->values[ i * step ] = " << this->values[ i * step ] << " != 0.0" << std::endl;
		if( this->values[ i * step ] != 0.0 ) // Returns the same amount of elements in a row as does getRowLength() in ChunkedEllpack. WHY?
		elementCount++;
		}

		std::cout << "Element Count = " << elementCount << "\n";
		// std::cout << "Element Count = " << elementCount << "\n";

		return elementCount;
		}