Added matrices for tests. Remade calling of computation as DDC. Made calling from terminal. (c4ea70a8) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -161,7 +161,7 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
		matrix.getData( pom );
		if( std::is_same< Device, Device2 >::value )
		{
		printf("copy host to host or device to device\n");
		//printf("copy host to host or device to device\n");
		this->data = pom;
		}
		#ifdef HAVE_CUDA

src/Benchmarks/GEM/gem.h

0 → 100644

+212 −0

Original line number	Diff line number	Diff line

		#include "Matrix/Matrix.h"
		#include "gem/GEM.h"
		#include "TNL/tnl-dev/src/TNL/Math.h"
		//#include "gem/GEMdevice.h"
		#include <TNL/Matrices/MatrixReader.h>

		#include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h"
		#include "TNL/tnl-dev/src/TNL/Devices/Cuda.h"
		#include "TNL/tnl-dev/src/TNL/Containers/Vector.h"
		#include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h"
		#include "TNL/tnl-dev/src/TNL/Devices/Host.h"

		#define COMPARE_RESULTS true;

		// Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting

		using namespace TNL;
		using namespace TNL::Containers;
		using namespace std;


		template < typename Real, typename Index >
		void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		Vector< Real, Devices::Host, Index >& vector, const String& vectorName );

		template < typename Real, typename Index >
		void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName );


		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName );


		template < typename Real,
		typename Index,
		typename Device >
		Vector< Real, Device, Index > runGEM( const String& matrixName, const String& vectorName, const int loops,
		const int verbose, const String& device, const String& pivoting )
		{
		typedef Matrix< Real, Device, Index > MatrixType;
		typedef Vector< Real, Device, Index > VectorType;

		MatrixType matrix;
		VectorType vector;

		readMatrixVector( matrix, vector, matrixName, vectorName );
		VectorType vectorResult( matrix.getNumRows() );

		// Computation
		double* time;
		time = new double[ loops ];

		std::cout << "Starting computation on " << device << endl;
		for( int i = 0; i < loops; i++ )
		{
		MatrixType matrixComp = matrix;
		VectorType vectorComp( vector );
		vectorComp.setValue( 0 );
		vectorResult.setValue( 0 );

		GEM< Real, Device, Index > gem( matrixComp, vectorComp );

		std::clock_t start;
		double duration;

		start = std::clock();

		cout << "starting computation number " << i+1 << endl;
		gem.solve( vectorResult, pivoting, 0 );

		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
		time[i] = duration;

		double l2norm = 0;
		for( int j = 0; j < matrix.getNumRows(); j++ )
		l2norm += (vectorResult.getElement( i ) - 1)*(vectorResult.getElement( i ) - 1);
		l2norm = std::sqrt(l2norm);
		printf( " %.4f ", l2norm);
		}
		printf("\n ... done!\n");

		delete []time;

		return vectorResult;

		/*
		//#ifdef HAVE_CUDA




		#ifdef COMPARE_RESULTS
		double error = 0.0;
		//std::cout << "Results:\nrow:"<< setw(20) << "Host" << setw(20) << "Device" << setw(20) << "Error" << std::endl;
		for( int i = 0; i < matrix.getNumRows(); i++ )
		{
		double errorPom = ( result_vector.getElement(i) - result_vector_dev.getElement(i) ) *
		( result_vector.getElement(i) - result_vector_dev.getElement(i) );
		//std::cout << i << ": " << setw(20) << result_vector.getElement(i) << setw(20) << result_vector_dev.getElement(i) << setw(20) << errorPom << std::endl;
		error += errorPom;
		}

		error = std::sqrt(error);
		printf("Difference in L2 norm from Device and Host is %.8f\n", error );

		double CPUmean(0), GPUmean(0);
		cout << "Timers: " << endl << "CPU: [ ";
		for( int i = 0; i < loops; i++ )
		{
		CPUmean += timeCPU[i];
		GPUmean += timeGPU[i];
		cout << timeCPU[i] << " ";
		}
		CPUmean = CPUmean/loops;
		GPUmean = GPUmean/loops;

		cout << "]" << endl;
		cout << "GPU: [ ";
		for( int i = 0; i < loops; i++ )
		cout << timeGPU[i] << " ";
		cout << "]" << endl;
		cout << "CPU mean time: " << CPUmean << endl;
		cout << "GPU mean time: " << GPUmean << endl;
		#endif

		#endif

		delete []timeCPU;
		#ifdef HAVE_CUDA
		delete []timeGPU;
		#endif
		return;*/
		}

		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName )
		{
		typedef Matrix< Real, Devices::Host, Index> MatrixHost;
		MatrixHost matrixHost;
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
		cout << "reading matrix " << matrixName << endl;
		//matrixHost.showMatrix();
		matrix = matrixHost;

		// Get vector
		Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() );

		if( vectorName == "none" )
		calculHostVecOne( matrixHost, vectorHost, vectorName );
		else
		readVector( vectorHost, vectorName );
		cout << "reading vector " << vectorName << endl;
		vector = vectorHost;

		}

		template < typename Real, typename Index >
		void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		Vector< Real, Devices::Host, Index >& vector, const String& vectorName )
		{
		for( int i = 0; i < matrix.getNumRows(); i++ )
		{
		Real pom = 0;
		for( int j = 0; j < matrix.getNumColumns(); j++ )
		{
		pom += matrix.getElement(i,j);
		}
		vector[i] = pom;
		}
		cout << endl;

		ofstream outdata; // outdata is like cin

		outdata.open("./test-matrices/" + vectorName ); // opens the file
		if( !outdata ) { // file couldn't be opened
		cerr << "Error: file could not be opened" << endl;
		exit(1);
		}

		for( int i = 0; i < vector.getSize(); i++ )
		{
		outdata << vector[i] << endl;
		}
		outdata.close();
		cout << endl;
		}

		template < typename Real, typename Index >
		void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName )
		{
		ifstream inFile;
		Real x;
		inFile.open("./test-matrices/" + vectorName );
		if (!inFile) {
		cout << "Unable to open file" << endl;
		return;
		}

		int i = 0;
		while ( inFile >> x ) {
		vector_host[i] = x;
		i++;
		}
		inFile.close();
		}
		No newline at end of file

src/Benchmarks/GEM/gem-CPU/GEM.h→src/Benchmarks/GEM/gem/GEM.h

+52 −0

Original line number	Diff line number	Diff line
		@@ -9,26 +9,33 @@

		#include <ostream>

		template < typename Real = double,
		typename Device = TNL::Devices::Host,
		typename Index = int >
		template < typename Device >
		class GEMDeviceDependentCode;

		template < typename Real,
		typename Device,
		typename Index >
		class GEM
		{
		public:

		typedef Device DeviceType;
		typedef Matrix< Real, Device, Index > MatrixGEM;
		typedef TNL::Containers::Array< Real, Device, Index > Array;
		typedef TNL::Containers::Vector< Real, Device, Index > Array;


		GEM( MatrixGEM& A,
		Array& b );

		bool solve( Array& x, int verbose = 0 );
		bool solve( Array& x, const TNL::String& pivoting, int verbose = 0 );

		bool solveWithoutPivoting( Array& x, int verbose = 0 );

		bool solveWithPivoting( Array& x, int verbose = 0 );

		bool computeLUDecomposition( int verbose = 0 );

		bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose );

		protected:

		void print( std::ostream& str = std::cout ) const;
		@@ -36,6 +43,9 @@ class GEM
		MatrixGEM A;

		Array b;

		typedef GEMDeviceDependentCode< DeviceType > DeviceDependentCode;
		friend class GEMDeviceDependentCode< DeviceType >;
		};

		#include "GEM_impl.h"

src/Benchmarks/GEM/gem-CPU/GEM_impl.h→src/Benchmarks/GEM/gem/GEM_impl.h

+44 −29

Original line number	Diff line number	Diff line
		@@ -13,6 +13,8 @@
		#pragma once

		#include <TNL/Assert.h>
		#include "GEMdevice.h"
		#include "GEM.h"
		#include <fstream>

		template< typename Real,
		@@ -22,11 +24,20 @@ GEM< Real, Device, Index >::GEM( MatrixGEM& A, Array& b )
		: A(A), b(b)
		{}

		template< typename Real,
		typename Device,
		typename Index >
		bool GEM< Real, Device, Index >::solve( Array& x, const TNL::String& pivoting, int verbose )
		{
		printf("in solve\n");
		return DeviceDependentCode::SolveGEM( *this, x, pivoting, verbose );
		}


		template< typename Real,
		typename Device,
		typename Index >
		bool GEM< Real, Device, Index >::solve( Array& x, int verbose )
		bool GEM< Real, Device, Index >::solveWithoutPivoting( Array& x, int verbose )
		{
		assert( b.getSize() == x.getSize() );

		@@ -40,6 +51,7 @@ bool GEM< Real, Device, Index >::solve( Array& x, int verbose )
		/****
		* Divide the k-th row by pivot
		*/
		if( verbose > 1 )
		if( k % 10 == 0 )
		std::cout << "Elimination: " << k << "/" << n << std::endl;
		const Real& pivot = this->A.getElement( k, k );
		@@ -106,7 +118,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
		for( int k = 0; k < n; k++ )
		{
		if( verbose > 1 )
		std::cout << "Step " << k << "/" << n << ".... \r";
		std::cout << "Step " << k << "/" << n << "....\n"; //"\r";
		/****
		* Find the pivot - the largest in k-th row
		*/
		@@ -255,35 +267,38 @@ void GEM< Real, Device, Index >::print( std::ostream& str ) const
		str << " \| " << std::setprecision( precision ) << std::setw( precision + 6 ) << b[ row ] << " \|" << std::endl;
		}
		}
		#ifdef HAVE_CUDA
		/*template < typename Real,
		typename Index >*/
		__global__ void GEMKernel( /*Matrix< Real, Devices::Cuda, Index >& matrix,
		Containers::Array< Real, Devices::Cuda, Index >& b,
		Containers::Array< Real, Devices::Cuda, Index >& x,
		int verbose*/ )

		template <>
		class GEMDeviceDependentCode< TNL::Devices::Host >
		{
		public:
		typedef TNL::Devices::Host DeviceType;

		template < typename Real, typename Index>
		static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
		TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
		{
		unsigned int i = threadIdx.x;// + blockDim.x * blockIdx.x;
		printf( "%d\n" , i );
		printf("starting computation on CPU\n");
		return pivoting == "yes" ? gem.solveWithPivoting( x, verbose ) : gem.solveWithoutPivoting(x, verbose );
		}
		};

		#endif

		template <>
		class GEMDeviceDependentCode< TNL::Devices::Cuda >
		{
		public:
		typedef TNL::Devices::Cuda DeviceType;

		/*template< typename Real,
		typename Device,
		typename Index >
		bool Matrix< Real, Device, Index >::solveCuda( Array& b, Array& x, int verbose )
		template < typename Real, typename Index>
		static bool SolveGEM( GEM< Real, DeviceType, Index>& gem,
		TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
		{
		#ifdef HAVE_CUDA
		Index numRows = this->A.getNumRows();
		Index numColumns = this->A.getNumColumns();
		Index numThreads = 32;
		Index numBlocks = roundUpDivision( numRows, numThreads );
		// TODO: num of blocks needed higher than num of blocks able to calculate
		Devices::Cuda::synchronizeDevice();
		GEMKernel<<< 1, 1 >>>( this, b, x, verbose );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("starting computation on GPU\n");
		gem.GEMdevice( x, pivoting, verbose );
		#endif
		}*/
		return true;
		}
		};

src/Benchmarks/GEM/gem-GPU/GEMdevice.h→src/Benchmarks/GEM/gem/GEMdevice.h

+44 −53

Original line number	Diff line number	Diff line
		#include "GEMkernels.h"
		#include "TNL/Cuda/MemoryHelpers.h"
		#define DEBUG 0



		#ifdef HAVE_CUDA
		#include "GEMkernels.h"

		template < typename Real,
		typename Index >
		void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev,
		void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev )
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev);
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);

		int blockSize = matrixDev.getNumRows() > 1024 ? 1024 : matrixDev.getNumColumns();
		int numBlocksOnColumn = TNL::roundUpDivision( matrixDev.getNumRows(), 1024 );
		int numOfBlocks = matrixDev.getNumRows() * numBlocksOnColumn;
		int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
		int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn;


		GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(),result_vector_dev.getView() );
		GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		@@ -27,28 +27,27 @@ void calculateResultSeqCPU( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev







		template < typename Real,
		typename Device,
		typename Index >
		void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& result_vector_dev )
		bool GEM<Real, Device, Index >::GEMdevice( /*Matrix< Real, TNL::Devices::Cuda, Index >& this->A,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, */
		Array& x, const TNL::String& pivoting, int verbose )
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrixDev);

		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );
		// FOR PIVOTING SET VARIABLES ON DEVICE
		size_t size = sizeof(int);
		int* pivot; cudaMalloc(&pivot, size);


		for( int colPointer = 0; colPointer < matrixDev.getNumColumns(); colPointer++ )
		for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
		{
		int reduceBlockSize = (matrixDev.getNumColumns()-colPointer) > 512 ? 512 :
		TNL::roundToMultiple( matrixDev.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( matrixDev.getNumColumns()-colPointer, reduceBlockSize );

		// PIVOTING
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );

		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound);
		@@ -68,63 +67,55 @@ void GEMdevice( Matrix< Real, TNL::Devices::Cuda, Index >& matrixDev,
		//printf("%d: position of pivot: %d\n", colPointer, *pom);


		int blockSize = (matrixDev.getNumColumns()-colPointer) > 1024 ? 1024 : matrixDev.getNumColumns();
		int numBlocksOnRow = TNL::roundUpDivision( (matrixDev.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = matrixDev.getNumRows() * numBlocksOnRow;
		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;
		//printf( "%d number of threads, %d number of blocks\n", blockSize, numOfBlocks);

		if( pom != -1 && pom != colPointer )
		{
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		}

		/*showMatrix<<< 1, 1 >>>( matrixDev );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;*/
		TNL_CHECK_CUDA_DEVICE;
		}

		/int pom = (int)malloc(size); pom = 0;
		cudaMemcpy(pivot, pom, size, cudaMemcpyHostToDevice);
		findPivot<<< numBlocksOnRow, 1024 >>>( devMat, colPointer, numBlocksOnRow );
		/*printf("\n");
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");*/

		findRowPivot<<< numBlocksOnRow, blockSize >>>( devMat, colPointer, numBlocksOnRow, d_max, pivot );
		GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat,
		device_vector.getView(),
		colPointer,
		numBlocksOnRow );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;


		cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
		//printf("%d: position of pivot: %d\n", colPointer, *pom);
		if( pom != -1 && pom != colPointer )
		{
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		}*/


		/*cudaMemcpy( pom, d_max, size, cudaMemcpyDeviceToHost);
		printf("%d\n", pom );/



		GEMColumnUnderDiag<<< numOfBlocks, blockSize >>>( devMat,
		/*int blockSize1 = this->A.getNumRows() > 1024 ? 1024 : this->A.getNumRows();
		int gridSize1 = TNL::roundUpDivision( this->A.getNumRows(), blockSize1 );
		GEMZeroingMainColumn<<< gridSize1, blockSize1 >>>(devMat,
		device_vector.getView(),
		colPointer,
		numBlocksOnRow );
		colPointer );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		/*printf("\n");
		showMatrix<<< 1, 1 >>>( matrixDev );

		printf("\n");
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");*/
		printf("\n");
		std::cout << device_vector << std::endl;*/
		}

		cudaFree(pivot);
		//std::cout << device_vector << std::endl;

		calculateResultSeqCPU( matrixDev, device_vector, result_vector_dev );

		calculateResultSeqCPU( this->A, device_vector, x );

		return true;
		}

		#endif // HAVE_CUDA
		No newline at end of file