Commit 9eb190f2 authored by Lukas Cejka's avatar Lukas Cejka Committed by Tomáš Oberhuber
Browse files

Removed useless comments in preparation for rebase.

parent cee27c2a
Loading
Loading
Loading
Loading
+7 −28
Original line number Diff line number Diff line
@@ -59,8 +59,6 @@ std::string getMatrixFormat( const Matrix& matrix )
    return format;
}

// This function is not used currently (as of 17.03.19),
//  as the log takes care of printing and saving this information into the log file.
// Print information about the matrix.
template< typename Matrix >
void printMatrixInfo( const Matrix& matrix,
@@ -218,14 +216,6 @@ benchmarkSpMV( Benchmark & benchmark,
    // Setup cuSPARSE MetaData, since it has the same header as CSR, 
    //  and therefore will not get its own headers (rows, cols, speedup etc.) in log.
    //      * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
    
    // FIXME: How to include benchmark with different name under the same header as the current format being benchmarked???
    // FIXME: Does it matter that speedup show difference only between current test and first test?
    //          Speedup shows difference between CPU and GPU-cuSPARSE, because in Benchmarks.h:
    //              * If there is no baseTime, the resulting test time is set to baseTime.
    //              * However, if there is a baseTime (from the CPU compared to GPU test),
    //                  baseTime isn't changed. If we change it in Benchmarks.h to compare 
    //                  the speedup from the last test, it will mess up BLAS benchmarks etc.
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
          { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
@@ -244,7 +234,6 @@ benchmarkSpMV( Benchmark & benchmark,
    resultcuSPARSEDeviceVector2 = deviceVector2;
 #endif
    
//#ifdef COMPARE_RESULTS
    // Difference between GPU (current format) and GPU-cuSPARSE results
    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
@@ -274,15 +263,6 @@ benchmarkSpMV( Benchmark & benchmark,
    std::cout << GPUcuSparse_absMax << std::endl;
    std::cout << GPUcuSparse_lpNorm << std::endl;
    
    // FIXME: This isn't an elegant solution, it makes the log file very long.
//    benchmark.addErrorMessage( GPUcuSparse_absMax, 1 );
//    benchmark.addErrorMessage( GPUcuSparse_lpNorm, 1 );
    
//    benchmark.addErrorMessage( CPUxGPU_absMax, 1 );
//    benchmark.addErrorMessage( CPUxGPU_lpNorm, 1 );
    
//#endif
    
    std::cout << std::endl;
    return true;
}
@@ -295,15 +275,14 @@ benchmarkSpmvSynthetic( Benchmark & benchmark,
                        bool verboseMR )
{
   bool result = true;
   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
//   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
//   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
   
   // AdEllpack/BiEllpack doesn't have cross-device assignment ('= operator') implemented yet
   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
//   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR );   
   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR );
   
   // AdEllpack is broken
//   result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR );
   result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR );
   return result;
}

+4 −5
Original line number Diff line number Diff line
@@ -81,13 +81,12 @@ public:
            std::cout << "HEAD==TAIL" << std::endl;
        else
        {
            // TEST
            for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next )
            {
                if( i == this->getHead() );
//                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else if( i == this->getTail() );
//                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                if( i == this->getHead() )
                    std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else if( i == this->getTail() )
                    std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
                else
                    std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl;
            }
+7 −37
Original line number Diff line number Diff line
@@ -147,10 +147,6 @@ warpList< MatrixType >::~warpList()
        delete temp;
    }
    delete this->head;
    
    // TEST
//    std::cout << "List destructor." << std::endl;
//    this->printList();
}


@@ -1169,46 +1165,31 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,

    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
    // Save the value, to save calling access every loop.
    const IndexType warpLoad = this->localLoad[ warpIdx ];
    
    // The unroll factor is 4, therefore if a warp has less than 4 localLoad, it cannot be unrolled
    //  and must be calculated separately.
    if( warpLoad < 4 )
    {
        // While the helpful index of the warp localLoad is less than localLoad and the element index isn't
        //  out of the matrix (would return the number of columns of the matrix)
        while( i < warpLoad &&
               this->columnIndexes[ elementPtr ] < this->getColumns() )
        {
            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
            // For the current thread, shift the elements ptr by warpSize (to keep the thread on one row)
            elementPtr += this->warpSize;
            i++; // Increment the helpful localLoad index.
            i++;
        }
    }
    else // If the localLoad of the warp is unrollable.
    else
    {
        // Is the warpLoad divisible by 4 (4 - 1 for binary AND).
        //  This will return how far it is from being divisible:
        //  For 0 & 3 = 0; 1 & 3 = 1; 2 & 3 = 2; 3 & 3 = 3; 4 & 3 = 0, etc.
        IndexType alignUnroll = warpLoad & 3;
        
        // While the result of divisibility by 4 has not reached the point where it is divisible by 4.
        while( alignUnroll != 0 &&
               this->columnIndexes[ elementPtr ] < this->getColumns() )
        {        
                temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
                elementPtr += this->warpSize;
                i++;
                // If alignUnroll is not 0 (i.e. the number of NNZ elements is not divisible by 4), decrement alignUnroll until it is.
                //  This ensures that the starting index i will be incremented to the correct starting position for the unroll.
                alignUnroll--;
        }
    }

    // For those rows that have warpLoad < unroll factor, this for loop won't even get past the first condition.
    //  Otherwise unroll.
    for( ; i < this->localLoad[ warpIdx ]; i += 4 )
    {
        #pragma unroll
@@ -1222,7 +1203,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
        }
    }
    
    // What is going on here? DOCUMENT
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
        IndexType end = ( warpIdx + 1 ) << 5;
@@ -1265,15 +1245,6 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
    const IndexType warpLoad = this->localLoad[ warpIdx ];
    
//    for( IndexType i = 0; i < warpLoad; i++ )
//    {
//        if( this->columnIndexes[ elementPtr ] < this->getColumns() )
//        {
//            temp[ threadIdx.x ] += this->values[ elementPtr] * inVector[ this->columnIndexes[ elementPtr ] ];
//            elementPtr += this->warpSize;
//        }
//    }
    
    if( warpLoad < 8 )
    {
        while( i < warpLoad &&
@@ -1496,7 +1467,6 @@ public:
	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
        TNL_CHECK_CUDA_DEVICE;
        std::cout << "totalLoad = " << matrix.totalLoad << std::endl;
	if( matrix.totalLoad < 2 )
	{
	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
@@ -1520,7 +1490,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 4 ) // WORKS
	else if( matrix.totalLoad < 4 )
	{
	    dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1543,7 +1513,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 8 ) // Maybe works?
	else if( matrix.totalLoad < 8 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1566,7 +1536,7 @@ public:
	    Devices::Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 16 ) // BROKEN
	else if( matrix.totalLoad < 16 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1589,7 +1559,7 @@ public:
	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else // BROKEN
	else
	{
	    dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1606,7 +1576,7 @@ public:
                                                       kernel_outVector,
                                                       gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE; // FREEZES right here on CHECK CUDA
	    TNL_CHECK_CUDA_DEVICE;
	    Devices::Cuda::freeFromDevice( kernel_this );
	    Devices::Cuda::freeFromDevice( kernel_inVector );
	    Devices::Cuda::freeFromDevice( kernel_outVector );
+3 −4
Original line number Diff line number Diff line
@@ -802,7 +802,6 @@ template< typename Real,
void BiEllpack< Real, Device, Index >::printValues() const
{
    for( Index i = 0; i < this->values.getSize(); i++ ) {
    // Random values are stored with the column index of getColumns(). e.g. a matrix has 4 columns, values are at column indexes 0, 1, 2, 3 and junk data at index 4.
        if( this->columnIndexes.getElement( i ) != this->getColumns() )
            std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) 
             << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl;
+2 −42
Original line number Diff line number Diff line
@@ -124,41 +124,8 @@ template< typename Real,
Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const
{
    // TODO: Fix/Implement
    throw Exceptions::NotImplementedError( "CSR::getNonZeroRowLength is not implemented." );
//    if( std::is_same< DeviceType, Devices::Host >::value )
//    {
//       ConstMatrixRow matrixRow = this->getRow( row );
//       return matrixRow.getNonZeroElementsCount();
//    }
//    if( std::is_same< DeviceType, Devices::Cuda >::value )
//    {
//       IndexType *cols = new IndexType[4];
//       RealType *vals = new RealType[4];
//       for( int i = 0; i < 4; i++ )
//       {
//           cols[i] = i;
//           vals[i] = 1.0;
//       }
//       ConstMatrixRow matrixRow(cols, vals, 4, 1);
// //      ConstMatrixRow matrixRow = this->getRow( row );// If the program even compiles, this line fails because a segfault is thrown on the first line of getRow()
//       // WHEN debugging with GDB:
//       //  (gdb) p this->rowPointers[0]
//       //    Could not find operator[].
//       //  (gdb) p rowPointers.getElement(0)
//       //    Attempt to take address of value not located in memory.
//       IndexType resultHost ( 0 );
//       IndexType* resultCuda = Cuda::passToDevice( resultHost );
//       // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
//       //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
//       TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
//       delete []cols;
//       delete []vals;
//       std::cout << "Checkpoint BEFORE passFromDevice" << std::endl;
//       resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
//       std::cout << "Checkpoint AFTER passFromDevice" << std::endl;
//       Cuda::freeFromDevice( resultCuda );
//       return resultHost;
//   }
    TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" );
    return 0;
}

template< typename Real,
@@ -223,13 +190,6 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
                                                          const RealType& value,
                                                          const RealType& thisElementMultiplicator )
{
   /*TNL_ASSERT( row >= 0 && row < this->rows &&
              column >= 0 && column <= this->rows,
              std::cerr << " row = " << row
                   << " column = " << column
                   << " this->rows = " << this->rows
                   << " this->columns = " << this-> columns );*/

   IndexType elementPtr = this->rowPointers[ row ];
   const IndexType rowEnd = this->rowPointers[ row + 1 ];
   IndexType col = 0;
Loading