Tuning Ellpack format. (670f1892) · Commits · TNL / tnl-dev

src/matrices/tnlEllpackMatrix_impl.h

+70 −1

Original line number	Diff line number	Diff line
		@@ -115,6 +115,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setLike( const tnlEllpackMatrix< R
		if( ! tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) )
		return false;
		this->rowLengths = matrix.rowLengths;
		this->alignedRows = matrix.alignedRows;
		return true;
		}

		@@ -125,6 +126,7 @@ void tnlEllpackMatrix< Real, Device, Index > :: reset()
		{
		tnlSparseMatrix< Real, Device, Index >::reset();
		this->rowLengths = 0;
		this->alignedRows = 0;
		}

		template< typename Real,
		@@ -693,6 +695,39 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
		}
		};

		#ifdef HAVE_CUDA
		template<
		typename Real,
		typename Index >
		__global__ void tnlEllpackMatrixVectorProductCudaKernel(
		const Index rows,
		const Index columns,
		const Index compressedRowsLengths,
		const Index alignedRows,
		const Index* columnIndexes,
		const Real* values,
		const Real* inVector,
		Real* outVector,
		const Index gridIdx )
		{
		const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx >= rows )
		return;
		Index i = rowIdx;
		Index el( 0 );
		Real result( 0.0 );
		Index columnIndex;
		while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns )
		{
		result += values[ i ] * inVector[ columnIndex ];
		i += alignedRows;
		}
		outVector[ rowIdx ] = result;
		}
		#endif



		template<>
		class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		{
		@@ -734,7 +769,41 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		const InVector& inVector,
		OutVector& outVector )
		{
		tnlMatrixVectorProductCuda( matrix, inVector, outVector );
		//tnlMatrixVectorProductCuda( matrix, inVector, outVector );
		#ifdef HAVE_CUDA
		typedef tnlEllpackMatrix< Real, Device, Index > Matrix;
		typedef typename Matrix::IndexType IndexType;
		//Matrix* kernel_this = tnlCuda::passToDevice( matrix );
		//InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
		//OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlEllpackMatrixVectorProductCudaKernel
		< Real, Index >
		<<< cudaGridSize, cudaBlockSize >>>
		( matrix.getRows(),
		matrix.getColumns(),
		matrix.rowLengths,
		matrix.alignedRows,
		matrix.columnIndexes.getData(),
		matrix.values.getData(),
		inVector.getData(),
		outVector.getData(),
		gridIdx );
		checkCudaDevice;
		}
		//tnlCuda::freeFromDevice( kernel_this );
		//tnlCuda::freeFromDevice( kernel_inVector );
		//tnlCuda::freeFromDevice( kernel_outVector );
		checkCudaDevice;
		cudaThreadSynchronize();
		#endif

		}
		};

src/matrices/tnlSlicedEllpackMatrix_impl.h

+58 −14

Original line number	Diff line number	Diff line
		@@ -777,6 +777,42 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne
		}
		#endif

		#ifdef HAVE_CUDA
		template<
		typename Real,
		typename Index,
		int SliceSize >
		__global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel(
		const Index rows,
		const Index columns,
		const Index* slicePointers,
		const Index* sliceCompressedRowsLengths,
		const Index* columnIndexes,
		const Real* values,
		const Real* inVector,
		Real* outVector,
		const Index gridIdx )
		{
		const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx >= rows )
		return;
		const Index sliceIdx = rowIdx / SliceSize;
		const Index slicePointer = slicePointers[ sliceIdx ];
		const Index rowLength = sliceCompressedRowsLengths[ sliceIdx ];
		Index i = slicePointer + rowIdx - sliceIdx * SliceSize;
		const Index rowEnd = i + rowLength * SliceSize;
		Real result( 0.0 );
		Index columnIndex;
		while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns )
		{
		result += values[ i ] * inVector[ columnIndex ];
		i += SliceSize;
		}
		outVector[ rowIdx ] = result;
		}
		#endif


		template<>
		class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
		{
		@@ -862,13 +898,13 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
		const InVector& inVector,
		OutVector& outVector )
		{
		tnlMatrixVectorProductCuda( matrix, inVector, outVector );
		/*#ifdef HAVE_CUDA
		//tnlMatrixVectorProductCuda( matrix, inVector, outVector );
		#ifdef HAVE_CUDA
		typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
		typedef typename Matrix::IndexType IndexType;
		Matrix* kernel_this = tnlCuda::passToDevice( matrix );
		InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
		OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
		//Matrix* kernel_this = tnlCuda::passToDevice( matrix );
		//InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
		//OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		@@ -876,18 +912,26 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		tnlSlicedEllpackMatrixVectorProductCudaKernel
		< Real, Index, SliceSize >
		<<< cudaGridSize, cudaBlockSize >>>
		( matrix.getRows(),
		matrix.getColumns(),
		matrix.slicePointers.getData(),
		matrix.sliceCompressedRowsLengths.getData(),
		matrix.columnIndexes.getData(),
		matrix.values.getData(),
		inVector.getData(),
		outVector.getData(),
		gridIdx );
		checkCudaDevice;
		}
		tnlCuda::freeFromDevice( kernel_this );
		tnlCuda::freeFromDevice( kernel_inVector );
		tnlCuda::freeFromDevice( kernel_outVector );
		//tnlCuda::freeFromDevice( kernel_this );
		//tnlCuda::freeFromDevice( kernel_inVector );
		//tnlCuda::freeFromDevice( kernel_outVector );
		checkCudaDevice;
		#endif*/
		cudaThreadSynchronize();
		#endif
		}

		};

tests/benchmarks/tnl-cuda-benchmarks.h

+55 −6

Original line number	Diff line number	Diff line
		@@ -22,12 +22,49 @@
		#include <core/vectors/tnlVector.h>
		#include <core/tnlTimerRT.h>
		#include <matrices/tnlSlicedEllpackMatrix.h>
		#include <matrices/tnlEllpackMatrix.h>

		#ifdef HAVE_CUBLAS
		//#include <cublas.h>
		#endif

		template< typename Matrix >
		__global__ void setCudaTestMatrixKernel( Matrix* matrix,
		const int elementsPerRow,
		const int gridIdx )
		{
		const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx >= matrix->getRows() )
		return;
		int col = Max( 0, rowIdx - elementsPerRow / 2 );
		for( int element = 0; element < elementsPerRow; element++ )
		{
		if( col + element < matrix->getColumns() )
		matrix->setElementFast( rowIdx, col + element, 1.0 );
		}
		}

		template< typename Matrix >
		void setCudaTestMatrix( Matrix& matrix,
		const int elementsPerRow )
		{
		typedef typename Matrix::IndexType IndexType;
		typedef typename Matrix::RealType RealType;
		Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		setCudaTestMatrixKernel< Matrix >
		<<< cudaGridSize, cudaBlockSize >>>
		( kernel_matrix, elementsPerRow, gridIdx );
		checkCudaDevice;
		}
		tnlCuda::freeFromDevice( kernel_matrix );
		}

		int main( int argc, char* argv[] )
		{
		@@ -195,8 +232,9 @@ int main( int argc, char* argv[] )
		* Sliced Ellpack test
		*/
		const int elementsPerRow( 5 );
		tnlSlicedEllpackMatrix< double, tnlHost > hostMatrix;
		tnlSlicedEllpackMatrix< double, tnlCuda > deviceMatrix;
		typedef tnlEllpackMatrix< double, tnlCuda > DeviceMatrix;
		tnlEllpackMatrix< double, tnlHost > hostMatrix;
		DeviceMatrix deviceMatrix;
		tnlVector< int, tnlHost, int > hostRowLengths;
		tnlVector< int, tnlCuda, int > deviceRowLengths;
		hostRowLengths.setSize( size );
		@@ -206,21 +244,28 @@ int main( int argc, char* argv[] )
		hostRowLengths.setValue( elementsPerRow );
		deviceRowLengths.setValue( elementsPerRow );
		hostMatrix.setCompressedRowsLengths( hostRowLengths );
		deviceMatrix.setCompressedRowsLengths( deviceRowLengths );
		if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) )
		{
		cerr << "Unable to allocate matrix elements." << endl;
		return false;
		}
		int elements( 0 );
		for( int row = 0; row < size; row++ )
		{
		cout << "Row " << row << "/" << size << " \r" << flush;
		int col = Max( 0, row - elementsPerRow / 2 );
		for( int element = 0; element < elementsPerRow; element++ )
		{
		if( col + element < size )
		{
		hostMatrix.setElement( row, col + element, 1.0 );
		deviceMatrix.setElement( row, col + element, 1.0 );
		//deviceMatrix.setElement( row, col + element, 1.0 );
		elements++;
		}
		}
		}
		cout << endl;
		setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
		datasetSize = loops * elements * sizeof( double ) / oneGB;
		hostVector.setValue( 1.0 );
		deviceVector.setValue( 1.0 );
		@@ -242,7 +287,11 @@ int main( int argc, char* argv[] )

		if( hostVector2 != deviceVector2 )
		{
		cerr << "Error in SliceEllpack Spmv kernel." << endl;
		cerr << "Error in SliceEllpack Spmv kernel at positions" << endl;
		//for( int i = 0; i < size; i++ )
		// if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) )
		// cerr << " " << i;

		}
		bandwidth = 2 * datasetSize / loops / timer.getTime();
		cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl;

src/matrices/tnlEllpackMatrix.h

+1 −1

File changed.

Contains only whitespace changes.