Added benchmark of original LightSpMV implementation. (7955b02a) · Commits · TNL / tnl-dev

src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h

+13 −2

Original line number	Diff line number	Diff line
		@@ -53,6 +53,17 @@ union Block {

		Block() = default;

		template< typename Index2 >
		Block& operator=( const Block< Index2 >& source ) {
		index[ 0 ] = source.index[ 0 ];
		index[ 1 ] = source.index[ 1 ];
		for( int i = 0; i < ( sizeof(Index) == 4 ? 8 : 16); i ++ )
		byte[ i ] = source.byte[ i ];
		for( int i = 0; i < (sizeof(Index) == 4 ? 4 : 8); i++ )
		twobytes[ i ] = source.twobytes[ i ];
		return *this;
		}

		Index index[2]; // index[0] is row pointer, index[1] is index in warp
		uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
		uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
		@@ -262,8 +273,8 @@ public:
		// copy assignment
		CSR& operator=( const CSR& matrix );

		template< CSRKernel KernelType2 >
		CSR& operator=( const CSR< RealType, DeviceType, IndexType, KernelType2 >& matrix );
		template< typename IndexType2, CSRKernel KernelType2 >
		CSR& operator=( const CSR< RealType, DeviceType, IndexType2, KernelType2 >& matrix );

		// cross-device copy assignment
		template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2,

src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -690,10 +690,10 @@ template< typename Real,
		typename Device,
		typename Index,
		CSRKernel KernelType >
		template< CSRKernel KernelType2 >
		template< typename IndexType2, CSRKernel KernelType2 >
		CSR< Real, Device, Index, KernelType >&
		CSR< Real, Device, Index, KernelType >::
		operator=( const CSR< Real, Device, Index, KernelType2 >& matrix )
		operator=( const CSR< Real, Device, IndexType2, KernelType2 >& matrix )
		{
		this->setLike( matrix );
		this->values = matrix.values;

src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -104,7 +104,7 @@ template< typename Real,
		typename Index >
		void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
		{
		TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
		TNL_ASSERT_GE( numberOfMatrixElements, ( IndexType ) 0, "Number of matrix elements must be non-negative." );

		this->values.setSize( numberOfMatrixElements );
		this->columnIndexes.setSize( numberOfMatrixElements );

src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu

0 → 100644

+381 −0

Original line number	Diff line number	Diff line
		/*
		* Options.cu
		*
		* Created on: Nov 24, 2014
		* Author: yongchao
		*/

		#include "Options.h"

		void Options::printUsage() {
		cerr << endl
		<< "LightSpMV (" << VERSION << ")"
		<< ": GPU-based sparse matrix-vector multiplication using CSR storate format"
		<< endl;
		cerr << "Usage: lightspmv -i matrix [options]" << endl << endl;
		cerr << "Options:" << endl;
		cerr << "Input:" << endl
		<< "\t-i <string> sparse matrix A file (in Matrix Market format)"
		<< endl
		<< "\t-x <string> vector X file (one element per line) [otherwise, set each element to 1.0]"
		<< endl
		<< "\t-y <string> vector Y file (one elemenet per line) [otherwise, set each element to 0.0]"
		<< endl << "Output:" << endl
		<< "\t-o <string> output file (one element per line) [otherwise, no output]"
		<< endl << "Compute:" << endl
		<< "\t-a <float> alpha value, default = " << _alpha << endl
		<< "\t-b <float> beta value, defualt = " << _beta << endl
		<< "\t-f <int> formula used, default = " << _formula << endl
		<< "\t 0: y = Ax" << endl << "\t 1: y = alpha * Ax + beta * y"
		<< endl << "\t-r <int> select the routine to use, default = "
		<< _routine << endl
		<< "\t 0: vector-based row dynamic distribution" << endl
		<< "\t 1: warp-based row dynamic distribution" << endl
		<< "\t-d <int> double-precision floating point, default = "
		<< (_singlePrecision ? 0 : 1) << endl
		<< "\t-g <int> index of the single GPU used, default = "
		<< _gpuIndex << endl
		<< "\t-m <int> number of SpMV iterations, default = " << _numIters
		<< endl << endl;
		}
		bool Options::parseArgs(int32_t argc, char* argv[]) {
		int32_t c;

		if (argc < 2) {
		printUsage();
		return false;
		}

		while ((c = getopt(argc, argv, "i:x:y:o:g:f:r:d:m:\n")) != -1) {
		switch (c) {
		case 'i':
		_mmFileName = optarg;
		break;
		case 'x':
		_vecXFileName = optarg;
		break;
		case 'y':
		_vecYFileName = optarg;
		break;
		case 'o':
		_outFileName = optarg;
		break;
		case 'a':
		_alpha = atof(optarg);
		break;
		case 'b':
		_beta = atof(optarg);
		break;
		case 'f':
		_formula = atoi(optarg);
		break;
		case 'g':
		_gpuIndex = atoi(optarg);
		if (_gpuIndex < 0) {
		_gpuIndex = 0;
		}
		break;
		case 'r':
		_routine = atoi(optarg);
		if (_routine < 0) {
		_routine = 0;
		}
		break;
		case 'd':
		_singlePrecision = atoi(optarg) ? false : true;
		break;
		case 'm':
		_numIters = atoi(optarg);
		if(_numIters < 1){
		_numIters = 1;
		}
		break;
		default:
		cerr << "Unknown parameter: " << optarg << endl;
		return false;
		}
		}

		/check the file length/
		if (_mmFileName.length() == 0) {
		cerr << "Matrix file should be specified" << endl;
		return false;
		}

		/load the list of GPUs/
		if (!getGPUs()) {
		return false;
		}

		/load the matrix/
		if (!loadMatrixMarketFile(_mmFileName.c_str())) {
		return false;
		}

		/load vector X/
		int64_t elementSize = _singlePrecision ? sizeof(float) : sizeof(double);
		int64_t numBytes = _numCols * elementSize;

		/allocate space/
		cudaMallocHost(&_vectorX, numBytes);
		CudaCheckError();

		/load the vector X/
		if (_vecXFileName.length() == 0) {
		/initialize X/
		cerr << "Initialize each element of vector X to 1.0" << endl;
		if (_singlePrecision) {
		float* p = (float*) _vectorX;
		for (uint32_t i = 0; i < _numCols; ++i) {
		p[i] = 1.0;
		}
		} else {
		double* p = (double*) _vectorX;
		for (uint32_t i = 0; i < _numCols; ++i) {
		p[i] = 1.0;
		}
		}
		} else {
		cerr << "Load vector X from file" << endl;
		/could not get the data/
		if (!loadVector(_vecXFileName, _vectorX, _numCols)) {
		return false;
		}
		}

		/load vector Y/
		numBytes = _numRows * elementSize;

		/allocate space/
		cudaMallocHost(&_vectorY, numBytes);
		CudaCheckError();

		/load the vector Y/
		if (_vecYFileName.length() == 0) {
		/initialize Y/
		cerr << "Initialize each element of vector Y to 0" << endl;

		memset(_vectorY, 0, numBytes);
		} else {
		cerr << "Load vector Y from file" << endl;
		/could not get the data/
		if (!loadVector(_vecYFileName, _vectorY, _numRows)) {
		return false;
		}
		}

		return true;
		}
		/convert the matrix market format to CSR/
		bool Options::loadMatrixMarketFile(const char* fileName) {
		uint64_t numBytes;

		cerr << "loading sparse matrix" << endl;
		if (_singlePrecision) {
		/create an empty CSR sparse matrix object/
		cusp::csr_matrix<uint32_t, float, cusp::host_memory> matrix;

		// load a matrix stored in MatrixMarket format
		cusp::io::read_matrix_market_file(matrix, fileName);

		/save the matrix information/
		_numRows = matrix.num_rows;
		_numCols = matrix.num_cols;
		_numValues = matrix.num_entries;

		/reserve memory/
		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
		CudaCheckError();

		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
		CudaCheckError();

		cudaMallocHost(&_numericalValues, _numValues * sizeof(float));
		CudaCheckError();

		/copy the elements/
		numBytes = (_numRows + 1) * sizeof(uint32_t);
		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();

		numBytes = _numValues * sizeof(uint32_t);
		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();

		numBytes = _numValues * sizeof(float);
		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();
		} else {
		/create an empty CSR sparse matrix object/
		cusp::csr_matrix<uint32_t, double, cusp::host_memory> matrix;

		// load a matrix stored in MatrixMarket format
		cusp::io::read_matrix_market_file(matrix, fileName);

		/save the matrix information/
		_numRows = matrix.num_rows;
		_numCols = matrix.num_cols;
		_numValues = matrix.num_entries;

		/reserve memory/
		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
		CudaCheckError();

		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
		CudaCheckError();

		cudaMallocHost(&_numericalValues, _numValues * sizeof(double));
		CudaCheckError();

		/copy the elements/
		numBytes = (_numRows + 1) * sizeof(uint32_t);
		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();

		numBytes = _numValues * sizeof(uint32_t);
		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();

		numBytes = _numValues * sizeof(double);
		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
		cudaMemcpyHostToHost);
		CudaCheckError();
		}

		return true;
		}
		bool Options::loadVector(const string& fileName, void* vector,
		const uint32_t maxNumValues) {
		char buffer[1024];
		FILE* file;
		uint32_t pos;
		float* fptr = (float*) vector;
		double* dptr = (double*) vector;

		cerr << "loading vector X" << endl;
		/open the file/
		if (fileName.length() == 0) {
		return false;
		}
		file = fopen(fileName.c_str(), "r");
		if (!file) {
		cerr << "Failed to open file " << fileName << endl;
		return false;
		}

		/read the file/
		pos = 0;
		while (fgets(buffer, 1023, file)) {
		/remove the end of line/
		for (int32_t i = strlen(buffer) - 1;
		i >= 0 && (buffer[i] == '\n' \|\| buffer[i] == '\r'); --i) {
		buffer[i] = '\0';
		}
		if (strlen(buffer) == 0) {
		continue;
		}

		/get the number and save to vector/
		if (pos >= maxNumValues) {
		/already have enough numbers/
		break;
		}
		if (_singlePrecision) {
		float value;
		sscanf(buffer, "%f", &value);
		fptr[pos++] = value;
		} else {
		double value;
		sscanf(buffer, "%lf", &value);
		dptr[pos++] = value;
		}
		}
		if (pos < maxNumValues) {
		cerr << "Do not have enough numbers in the file" << endl;
		return false;
		}
		cerr << "Finished loading vector X" << endl;
		return true;
		}
		void Options::getRowSizeVariance() {
		double rowStart;
		uint32_t rowEnd;

		/compute the variance/
		_variance = 0;
		_mean = rint((double) _numValues / _numRows);
		rowStart = _rowOffsets[0];
		for (uint32_t i = 1; i <= _numRows; ++i) {
		rowEnd = _rowOffsets[i];
		_variance += (rowEnd - rowStart - _mean) * (rowEnd - rowStart - _mean);
		rowStart = rowEnd;
		}
		_variance = rint(sqrt(_variance / (_numRows > 1 ? _numRows - 1 : 1)));

		/information/
		cerr << "Rows: " << _numRows << " Cols: " << _numCols << " Elements: "
		<< _numValues << " Mean: " << _mean << " Standard deviation: "
		<< _variance << endl;
		}
		bool Options::getGPUs() {
		int32_t numGPUs;

		/get the number of GPUs/
		if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
		cerr << "No CUDA-enabled GPU is available in the host" << endl;
		return false;
		}

		#if defined(HAVE_SM_35)
		cerr << "Require GPUs with compute capability >= 3.5" << endl;
		#else
		cerr << "Require GPUs with compute capability >= 3.0" << endl;
		#endif

		/iterate each GPU/
		cudaDeviceProp prop;
		for (int32_t i = 0; i < numGPUs; ++i) {

		/get the property of the device/
		cudaGetDeviceProperties(&prop, i);

		/check the major of the GPU/
		#if defined(HAVE_SM_35)
		if ((prop.major * 10 + prop.minor) >= 35) {
		#else
		if ((prop.major * 10 + prop.minor) >= 30) {
		#endif
		cerr << "GPU " << _gpus.size() << ": " << prop.name
		<< " (capability " << prop.major << "." << prop.minor << ")"
		<< endl;

		/save the Kepler GPU/
		_gpus.push_back(make_pair(i, prop));
		}
		}
		/check the number of qualified GPUs/
		if (_gpus.size() == 0) {
		cerr << "No qualified GPU is available" << endl;
		return false;
		}

		/check the GPU index/

		/reset the number of GPUs/
		if (_gpuIndex >= (int32_t) _gpus.size()) {
		_gpuIndex = _gpus.size() - 1;
		}
		if (_gpuIndex < 0) {
		_gpuIndex = 0;
		}

		/move the selected gpu to the first/
		swap(_gpus[0], _gpus[_gpuIndex]);

		return true;
		}

src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h

0 → 100644

+123 −0

Original line number	Diff line number	Diff line
		/*
		* Options.h
		*
		* Created on: Nov 21, 2014
		* Author: yongchao
		*/

		#ifndef OPTIONS_H_
		#define OPTIONS_H_

		#include "Types.h"
		//#include <cusp/io/matrix_market.h>

		struct Options {
		Options() {

		/input/
		_routine = 1;
		_formula = 1;
		_numIters = 1000;
		_singlePrecision = true;

		/matrix data/
		_numRows = 0;
		_numCols = 0;
		_rowOffsets = NULL;
		_numValues = 0;
		_colIndexValues = NULL;
		_numericalValues = NULL;
		_alpha = 1.0;
		_beta = 1.0;

		/vector data/
		_vectorX = NULL;
		_vectorY = NULL;

		/the number of GPUs/
		_numGPUs = 1;

		/GPU index used/
		_gpuIndex = 0;

		/for debug/
		_mean = 0;
		_variance = 0;
		}
		~Options() {
		if (_rowOffsets) {
		cudaFreeHost(_rowOffsets);
		}
		if (_colIndexValues) {
		cudaFreeHost(_colIndexValues);
		}
		if (_numericalValues) {
		cudaFreeHost(_numericalValues);
		}

		if (_vectorX) {
		cudaFreeHost(_vectorX);
		}
		if (_vectorY) {
		cudaFreeHost(_vectorY);
		}
		}

		/parse parameters/
		bool parseArgs(int32_t argc, char* argv[]);

		/load Matrix Market file/
		bool loadMatrixMarketFile(const char* fileName);

		/load vector/
		bool loadVector(const string& fileName, void* vector,
		const uint32_t maxNumValues);

		/print out usage/
		void printUsage();

		/get row distribution/
		void getRowSizeVariance();

		/retrieve GPU list/
		bool getGPUs();

		/input files/
		string _mmFileName;
		string _vecXFileName;
		string _vecYFileName;
		string _outFileName;
		bool _singlePrecision;
		int32_t _routine;
		int32_t _formula;
		int32_t _numIters;
		double _alpha;
		double _beta;

		/for debugging/
		double _mean;
		double _variance;

		/matrix data/
		uint32_t _numRows;
		uint32_t _numCols;
		uint32_t *_rowOffsets;
		uint32_t _numValues;
		uint32_t *_colIndexValues;
		void *_numericalValues;

		/vector data/
		void *_vectorX;
		void *_vectorY;

		/number of GPUs to be used/
		int32_t _numGPUs;

		/GPU index used/
		int32_t _gpuIndex;

		/GPU device list/
		vector<pair<int32_t, struct cudaDeviceProp> > _gpus;
		};

		#endif /* OPTIONS_H_ */