Fixing a bug in the comparison of host and device arrays. (31ee230c) · Commits · TNL / tnl-dev

src/core/arrays/tnlArrayOperationsCuda_impl.h

+5 −2

Original line number	Diff line number	Diff line
		@@ -256,6 +256,9 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest
		const Element2* source,
		const Index size )
		{
		/***
		* Here, destination is on host and source is on CUDA device.
		*/
		tnlAssert( destination, );
		tnlAssert( source, );
		tnlAssert( size >= 0, cerr << "size = " << size );
		@@ -280,7 +283,7 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest
		delete[] host_buffer;
		return false;
		}
		if( ! tnlArrayOperations< tnlHost >::compareMemory( host_buffer, destination, transfer ) )
		if( ! tnlArrayOperations< tnlHost >::compareMemory( &destination[ compared ], host_buffer, transfer ) )
		{
		delete[] host_buffer;
		return false;

src/matrices/tnlEllpackMatrix_impl.h

+12 −9

Original line number	Diff line number	Diff line
		@@ -687,12 +687,12 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
		const InVector& inVector,
		OutVector& outVector )
		{
		//#ifdef HAVE_OPENMP
		//#pragma omp parallel for
		//#endif
		// for( Index row = 0; row < matrix.getRows(); row ++ )
		// outVector[ row ] = matrix.rowVectorProduct( row, inVector );
		Index col;
		#ifdef HAVE_OPENMP
		#pragma omp parallel for
		#endif
		for( Index row = 0; row < matrix.getRows(); row ++ )
		outVector[ row ] = matrix.rowVectorProduct( row, inVector );
		/*Index col;
		for( Index row = 0; row < matrix.getRows(); row ++ )
		{
		outVector[ row ] = 0.0;
		@@ -700,8 +700,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
		for( Index i = row * matrix.rowLengths; i < rowEnd; i++ )
		if( ( col = matrix.columnIndexes[ i ] ) < matrix.columns )
		outVector[ row ] += matrix.values[ i ] * inVector[ col ];
		}

		}*/
		}
		};

		@@ -714,6 +713,7 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel(
		const Index columns,
		const Index compressedRowsLengths,
		const Index alignedRows,
		const Index paddingIndex,
		const Index* columnIndexes,
		const Real* values,
		const Real* inVector,
		@@ -727,7 +727,9 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel(
		Index el( 0 );
		Real result( 0.0 );
		Index columnIndex;
		while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns )
		while( el++ < compressedRowsLengths &&
		( columnIndex = columnIndexes[ i ] ) < columns &&
		columnIndex != paddingIndex )
		{
		result += values[ i ] * inVector[ columnIndex ];
		i += alignedRows;
		@@ -800,6 +802,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		matrix.getColumns(),
		matrix.rowLengths,
		matrix.alignedRows,
		matrix.getPaddingIndex(),
		matrix.columnIndexes.getData(),
		matrix.values.getData(),
		inVector.getData(),

src/matrices/tnlSlicedEllpackMatrix_impl.h

+5 −1

Original line number	Diff line number	Diff line
		@@ -787,6 +787,7 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel(
		const Index columns,
		const Index* slicePointers,
		const Index* sliceCompressedRowsLengths,
		const Index paddingIndex,
		const Index* columnIndexes,
		const Real* values,
		const Real* inVector,
		@@ -803,7 +804,9 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel(
		const Index rowEnd = i + rowLength * SliceSize;
		Real result( 0.0 );
		Index columnIndex;
		while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns )
		while( i < rowEnd &&
		( columnIndex = columnIndexes[ i ] ) < columns &&
		columnIndex < paddingIndex )
		{
		result += values[ i ] * inVector[ columnIndex ];
		i += SliceSize;
		@@ -919,6 +922,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
		matrix.getColumns(),
		matrix.slicePointers.getData(),
		matrix.sliceCompressedRowsLengths.getData(),
		matrix.getPaddingIndex(),
		matrix.columnIndexes.getData(),
		matrix.values.getData(),
		inVector.getData(),

tests/benchmarks/tnl-cuda-benchmarks.h

+12 −15

Original line number	Diff line number	Diff line
		@@ -36,11 +36,12 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
		const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx >= matrix->getRows() )
		return;
		int col = Max( 0, rowIdx - elementsPerRow / 2 );
		int col = rowIdx - elementsPerRow / 2;
		for( int element = 0; element < elementsPerRow; element++ )
		{
		if( col + element < matrix->getColumns() )
		matrix->setElementFast( rowIdx, col + element, 1.0 );
		if( col + element >= 0 &&
		col + element < matrix->getColumns() )
		matrix->setElementFast( rowIdx, col + element, element + 1 );
		}
		}

		@@ -152,7 +153,7 @@ int main( int argc, char* argv[] )
		timer.reset();
		timer.start();
		for( int i = 0; i < loops; i++ )
		resultDevice = deviceVector.scalarProduct( deviceVector );
		resultDevice = deviceVector.scalarProduct( deviceVector2 );
		timer.stop();
		bandwidth = 2 * datasetSize / timer.getTime();
		cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
		@@ -194,6 +195,7 @@ int main( int argc, char* argv[] )
		timer.start();
		for( int i = 0; i < loops; i++ )
		resultDevice = deviceVector.lpNorm( 2.0 );

		timer.stop();
		bandwidth = datasetSize / timer.getTime();
		cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
		@@ -254,12 +256,12 @@ int main( int argc, char* argv[] )
		{
		if( row % 100 == 0 )
		cout << "Row " << row << "/" << size << " \r" << flush;
		int col = Max( 0, row - elementsPerRow / 2 );
		int col = row - elementsPerRow / 2;
		for( int element = 0; element < elementsPerRow; element++ )
		{
		if( col + element < size )
		if( col + element >= 0 && col + element < size )
		{
		hostMatrix.setElement( row, col + element, 1.0 );
		hostMatrix.setElement( row, col + element, element + 1 );
		//deviceMatrix.setElement( row, col + element, 1.0 );
		elements++;
		}
		@@ -280,20 +282,15 @@ int main( int argc, char* argv[] )
		cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl;

		cout << "Benchmarking SpMV on GPU: ";
		deviceVector2.setValue( 0.0 );
		timer.reset();
		for( int i = 0; i < loops; i++ )
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		timer.stop();
		//cout << hostVector2 << endl << deviceVector2 << endl;

		if( hostVector2 != deviceVector2 )
		{
		cerr << "Error in SliceEllpack Spmv kernel at positions" << endl;
		//for( int i = 0; i < size; i++ )
		// if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) )
		// cerr << " " << i;
		cerr << "Error in Spmv kernel" << endl;

		}
		bandwidth = datasetSize / timer.getTime();
		cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl;

src/mesh/grids/tnlTraverser_Grid2D_impl.h

+1 −1

File changed.

Contains only whitespace changes.