Table of calculation added, print and verbose done. (fd737b29) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Matrix/Matrix.h

+12 −0

Original line number	Diff line number	Diff line
		@@ -37,6 +37,7 @@ class Matrix
		void setDimensions( const Index rows, const Index columns );

		void setCompressedRowLengths( CompressedRowLengthsVector& rowLengths ){ rowLengths.setValue(this->getNumberOfMatrixElements());}

		IndexType getNumberOfMatrixElements(){return this->getNumRows() * this->getNumColumns();}
		/**
		* Classic setElement method.
		@@ -85,6 +86,17 @@ class Matrix
		__cuda_callable__
		Index getNumColumns() const{return this->columns;}

		Index getNumNonzeros() {
		Index nonzeros = 0;
		Matrix< Real, TNL::Devices::Host, Index> m;
		m = *this;
		for( int i = 0; i < m.getNumRows(); i++ )
		for( int j = 0; j < m.getNumColumns(); j++ )
		if( m.getElement(i,j) != 0 )
		nonzeros++;
		return nonzeros;
		}

		/**
		* Function for switching 2 rows in matrix from column further. E.g. in matrix 5 x 5
		* switch rows row1 = 1 and row2 = 2 from column = 1 to columns = 5.

src/Benchmarks/GEM/gem.h

+44 −68

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@
		#include "Matrix/Matrix.h"
		#include "gem/GEM.h"
		#include "TNL/tnl-dev/src/TNL/Math.h"
		//#include "gem/GEMdevice.h"
		#include <typeinfo> // type printf
		#include <TNL/Matrices/MatrixReader.h>

		#include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h"
		@@ -31,7 +31,7 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName );
		const String& matrixName, const String& vectorName, const int verbose );


		template < typename Real,
		@@ -46,19 +46,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		MatrixType matrix;
		VectorType vector;

		readMatrixVector( matrix, vector, matrixName, vectorName );
		readMatrixVector( matrix, vector, matrixName, vectorName, verbose );
		VectorType vectorResult( matrix.getNumRows() );

		// Computation
		double* time;
		time = new double[ loops ];
		double error = -1;

		std::cout << "Starting computation on " << device << endl;
		if( verbose > 1 )
		cout << "Starting computation on " << device << endl;
		for( int i = 0; i < loops; i++ )
		{
		MatrixType matrixComp = matrix;
		VectorType vectorComp( vector );
		vectorComp.setValue( 0 );
		vectorResult.setValue( 0 );

		GEM< Real, Device, Index > gem( matrixComp, vectorComp );
		@@ -68,85 +69,56 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve

		start = std::clock();

		if( verbose > 1 )
		cout << "starting computation number " << i+1 << endl;
		gem.solve( vectorResult, pivoting, 0 );
		gem.solve( vectorResult, pivoting, verbose );

		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
		time[i] = duration;

		if( vectorName == "none" )
		{
		double l2norm = 0;
		for( int j = 0; j < matrix.getNumRows(); j++ )
		l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1);
		for( int j = 0; j < vectorResult.getSize(); j++ )
		l2norm += (vectorResult.getElement( j ) - 1)*(vectorResult.getElement( j ) - 1);
		l2norm = std::sqrt(l2norm);
		printf( " %.4f ", l2norm);
		error = l2norm;

		if( verbose > 1 )
		printf( "Norm in %d calculation is %.4f \n", i+1, l2norm);
		}
		}
		if( verbose > 1 )
		printf("\n ... done!\n");

		delete []time;

		return vectorResult;

		/*
		//#ifdef HAVE_CUDA




		#ifdef COMPARE_RESULTS
		double error = 0.0;
		//std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl;
		for( int i = 0; i < matrix.getNumRows(); i++ )
		{
		double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) *
		( result_vector.getElement(i) - result_vector_dev.getElement(i) );
		//std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl;
		error += errorPom;
		}
		double timeMean = 0;
		for( int i = 0; i < loops; i++ )
		timeMean += time[i];
		timeMean /= loops;

		error = std::sqrt(error);
		printf("Difference in L2 norm from Device and Host is %.8f\n", error );
		printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(),
		matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(),
		device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error);

		double CPUmean(0), GPUmean(0);
		cout << "Timers: " << endl << "CPU: [ ";
		for( int i = 0; i < loops; i++ )
		{
		CPUmean += timeCPU[i];
		GPUmean += timeGPU[i];
		cout << timeCPU[i] << " ";
		}
		CPUmean = CPUmean/loops;
		GPUmean = GPUmean/loops;
		delete []time;

		cout << "]" << endl;
		cout << "GPU: [ ";
		for( int i = 0; i < loops; i++ )
		cout << timeGPU[i] << " ";
		cout << "]" << endl;
		cout << "CPU mean time: " << CPUmean << endl;
		cout << "GPU mean time: " << GPUmean << endl;
		#endif

		#endif

		delete []timeCPU;
		#ifdef HAVE_CUDA
		delete []timeGPU;
		#endif
		return;*/
		return vectorResult;
		}

		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName )
		const String& matrixName, const String& vectorName, const int verbose )
		{
		typedef Matrix< Real, Devices::Host, Index> MatrixHost;
		MatrixHost matrixHost;
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
		if( verbose > 1 )
		cout << "reading matrix " << matrixName << endl;
		//matrixHost.showMatrix();
		if( verbose > 2 )
		matrixHost.showMatrix();
		matrix = matrixHost;

		// Get vector
		@@ -155,8 +127,13 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		if( vectorName == "none" )
		calculHostVecOne( matrixHost, vectorHost, vectorName );
		else
		{
		readVector( vectorHost, vectorName );
		if( verbose > 1 )
		cout << "reading vector " << vectorName << endl;
		}
		if( verbose > 2 )
		cout << vectorHost << endl;
		vector = vectorHost;

		}
		@@ -174,9 +151,8 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		}
		vector[i] = pom;
		}
		cout << endl;

		ofstream outdata; // outdata is like cin
		/*ofstream outdata; // outdata is like cin

		outdata.open("./test-matrices/" + vectorName ); // opens the file
		if( !outdata ) { // file couldn't be opened
		@@ -189,7 +165,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		outdata << vector[i] << endl;
		}
		outdata.close();
		cout << endl;
		cout << endl;*/
		}

		template < typename Real, typename Index >

src/Benchmarks/GEM/gem/GEM_impl.h

+13 −12

Original line number	Diff line number	Diff line
		@@ -29,7 +29,6 @@ template< typename Real,
		typename Index >
		bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose )
		{
		printf("in solve\n");
		return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose );
		}

		@@ -43,7 +42,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )

		const int n = this->A.getNumRows();

		if( verbose > 1 )
		if( verbose > 2 )
		this->print();

		for( int k = 0; k < n; k++ )
		@@ -65,7 +64,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
		this->A.setElement( k, j, this->A.getElement( k, j )/pivot );
		this->A.setElement( k, k, 1.0 );

		if( verbose > 1 )
		if( verbose > 2 )
		{
		std::cout << "Dividing by the pivot ... " << std::endl;
		this->print();
		@@ -82,7 +81,7 @@ bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
		this->A.setElement( i, k, 0.0 );
		}

		if( verbose > 1 )
		if( verbose > 2 )
		{
		std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl;
		this->print();
		@@ -112,13 +111,13 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )

		const int n = this->A.getNumRows();

		if( verbose > 1 )
		if( verbose > 2 )
		this->print();

		for( int k = 0; k < n; k++ )
		{
		if( verbose > 1 )
		std::cout << "Step " << k << "/" << n << "....\n"; //"\r";
		std::cout << "Step " << k << "/" << n << "....\n";
		/****
		* Find the pivot - the largest in k-th row
		*/
		@@ -142,11 +141,11 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
		b[ pivotPosition ] = pom;
		}

		if( verbose > 0 )
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl;
		std::cout << "Swaping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl;
		std::cout << "Swapping " << k << "-th and " << pivotPosition << "-th rows ... " << std::endl;
		}

		/****
		@@ -163,7 +162,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
		this->A.setElement( k, j, this->A.getElement( k, j)/pivot );
		this->A.setElement( k, k, 1.0 );

		if( verbose > 1 )
		if( verbose > 2 )
		{
		std::cout << "Dividing by the pivot ... " << std::endl;
		this->print();
		@@ -180,7 +179,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
		this->A.setElement( i, k , 0.0 );
		}

		if( verbose > 1 )
		if( verbose > 2 )
		{
		std::cout << "Subtracting the " << k << "-th row from the rows bellow ... " << std::endl;
		this->print();
		@@ -278,7 +277,8 @@ class GEMDeviceDependentCode< TNL::Devices::Host >
		static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
		TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
		{
		printf("starting computation on CPU\n");
		if( verbose > 1 )
		printf("Starting the computation SolveGEM.\n");
		return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose );
		}
		};
		@@ -295,7 +295,8 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda >
		TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
		{
		#ifdef HAVE_CUDA
		printf("starting computation on GPU\n");
		if( verbose > 1 )
		printf("Starting the computation SolveGEM.\n");
		gem.GEMdevice( x, pivoting, verbose );
		#endif
		return true;

src/Benchmarks/GEM/gem/GEMdevice.h

+50 −56

Original line number	Diff line number	Diff line
		@@ -8,7 +8,7 @@

		template < typename Real,
		typename Index >
		void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
		{
		@@ -30,20 +30,32 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		template < typename Real,
		typename Device,
		typename Index >
		bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */
		Array& x, const TNL::String& pivoting, int verbose )
		bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose )
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );

		// FOR PIVOTING SET VARIABLES ON DEVICE
		size_t size = sizeof(int);
		int* pivot; cudaMalloc(&pivot, size);
		int* pom = (int)malloc(size); pom = -1;

		if( verbose > 2 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");
		}


		for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
		{
		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );

		if( pivoting == "yes" )
		{
		// PIVOTING
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		@@ -61,59 +73,41 @@ bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, I
		findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		int* pom = (int)malloc(size); pom = 0;
		*pom = 0;
		cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
		//printf("%d: position of pivot: %d\n", colPointer, *pom);
		}


		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;
		//printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks);

		if( pom != -1 && pom != colPointer )

		if( pivoting == "yes" && pom != -1 && pom != colPointer )
		{
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

		/*printf("\n");
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");*/

		GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat,
		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		device_vector.getView(),
		colPointer,
		numBlocksOnRow );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;



		/*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows();
		int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 );
		GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat,
		device_vector.getView(),
		colPointer );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		printf("\n");
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");
		std::cout << device_vector << std::endl;*/
		}

		cudaFree(pivot);
		//std::cout << device_vector << std::endl;
		free(pom);

		calculateResultSeqCPU( this->A, device_vector, x );
		calculResultVector( this->A, device_vector, x );

		return true;
		}

src/Benchmarks/GEM/gem/GEMkernels.h

+1 −23

Original line number	Diff line number	Diff line
		@@ -6,22 +6,18 @@

		template <typename Real >
		__inline__ __device__ void warpReduceArgMax(Real& val, int& index) {
		//printf( "index = %d\n", index );
		__syncthreads();
		for (int offset = 32/2; offset > 0; offset /= 2)
		{
		Real val1 = __shfl_down_sync( 0xffffffff, val, offset, 32);
		int index1 = __shfl_down_sync( 0xffffffff, index, offset, 32);
		__syncthreads();
		//printf("%d: firstElementInRow = %.4f, %d: val1 = %.4f\n", index, val, index1, val1 );
		if( TNL::abs( val1 ) - TNL::abs( val ) > 0 )
		{
		val = val1;
		index = index1;
		//printf("%d: %.4f\n", index, firstElementInRow );
		}
		__syncthreads();
		//printf("%d: firstElementInRow = %.4f\n", index, val );
		}
		}

		@@ -150,32 +146,15 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		}
		}

		/*template <typename Real >
		__global__
		void GEMZeroingMainColumn( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain )
		{
		//int rowPointer = threadIdx.x + blockDim.x * blockIdx.x;
		//if( rowPointer < A->getNumRows() && rowPointer != colPointerMain )
		// A->setElement( rowPointer, colPointerMain, 0.0 );
		A->setElement( colPointerMain, colPointer, A->getElement( colPointerMain, colPointer ) / A->getElement( colPointerMain, colPointerMain ) );
		if( colPointer == colPointerMain+1 )
		b[ colPointerMain ] /= A->getElement( colPointerMain, colPointerMain );
		}*/



		template <typename Real >
		__global__
		void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A,
		void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow )
		{
		int rowPointer = blockIdx.x / numBlocksOnRow;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		/*if( rowPointer == colPointerMain && colPointer == colPointerMain )
		A->setElement( colPointerMain, colPointerMain, 1 );*/
		if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() )
		{
		if( A->getElement( colPointerMain, colPointerMain ) != 0 )
		@@ -192,7 +171,6 @@ void GEMColumnUnderDiag( Matrix< Real, TNL::Devices::Cuda, int >* A,
		if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0 )
		{
		b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain );
		//A->setElement( rowPointer, colPointerMain, 0.0 );
		}
		}