GEM can now calculate with multiple grid (matrix size 8000 rows +), better... (f6e34a44) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile.inc

+2 −2

Original line number	Diff line number	Diff line
		@@ -6,8 +6,8 @@ TNL_HEADERS = ${HOME}/.local/include
		INSTALL_DIR = ${HOME}/.local
		WITH_CUDA = yes
		WITH_MPI = yes
		WITH_OPENMP = yes
		WITH_DEBUG = yes
		WITH_OPENMP = no
		WITH_DEBUG = no
		MPI_FLAGT =
		MPI_FLAGS =
		#MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi

src/Benchmarks/GEM/Matrix/Matrix.h

+8 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,7 @@ class Matrix
		typedef Real RealType;
		typedef TNL::Containers::Vector< Index, Device, Index > CompressedRowLengthsVector;
		typedef TNL::Containers::Vector< Real, Device, Index > Vector;
		typedef TNL::Containers::Vector< Real, TNL::Devices::Host, Index > VectorHost;
		typedef TNL::Containers::VectorView< Real, Device, Index > VectorView;
		typedef TNL::Containers::VectorView< Real, TNL::Devices::Host, Index > HostVectorView;
		typedef TNL::Containers::VectorView< Real, TNL::Devices::Cuda, Index > DeviceVectorView;
		@@ -28,6 +29,13 @@ class Matrix
		Matrix( const Index rows, const Index columns );
		Matrix( Matrix< Real, Device, Index>& matrix );

		/**
		* Reset of matrix to 0 rows, 0 columns and 0 data memory.
		*
		* @return void
		*/
		void reset();

		/**
		* Sets dimension for matrix of rows x columns elements and allocates memory
		* for data array of this matrix.

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+37 −7

Original line number	Diff line number	Diff line
		@@ -25,6 +25,18 @@ Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix )
		matrix.getData( this->data );
		}

		template < typename Real,
		typename Device,
		typename Index >
		void Matrix< Real, Device, Index >::reset( )
		{
		this->rows = 1;
		this->columns = 1;
		this->data.reset();
		this->data.setSize( 1 );
		}



		template < typename Real,
		typename Device,
		@@ -40,8 +52,8 @@ void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index
		#if DEBUG
		printf("We are making host array!\n");
		#endif
		data.setSize( rows*columns );
		data.setValue( 0 );
		this->data.setSize( rows*columns );
		this->data.setValue( 0 );
		}
		#ifdef HAVE_CUDA
		if( std::is_same< Device, TNL::Devices::Cuda >::value )
		@@ -321,27 +333,45 @@ Matrix<Real, Device, Index >&
		Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix )
		{
		this->setDimensions( matrix.getNumRows(), matrix.getNumColumns() );
		Vector pom;
		matrix.getData( pom );
		if( std::is_same< Device, Device2 >::value )
		{
		Vector pom;
		matrix.getData( pom );
		this->data = pom;
		}
		#ifdef HAVE_CUDA
		else if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		VectorHost pom;
		matrix.getData( pom );

		VectorHost pom1;
		pom1.setSize( this->data.getSize() );

		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( int i = 0; i < this->rows; i++ )
		for( int j = 0; j < this->columns; j++ )
		this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] );
		pom1[ i * this->columns + j ] = pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ];
		this->data = pom1;
		}
		else if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		VectorHost pom;
		matrix.getData( pom );

		VectorHost pom1;
		pom1.setSize( this->data.getSize() );
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( int i = 0; i < this->getNumRows(); i++ )
		for( int j = 0; j < this->getNumColumns(); j++ )
		{
		this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j,
		pom.getElement( i * this->columns + j ) );
		pom1[ iTNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j ] = pom[ i this->columns + j];
		}
		this->data = pom1;
		}
		#endif // HAVE_CUDA
		return *this;

src/Benchmarks/GEM/gem.h

+50 −46

Original line number	Diff line number	Diff line
		@@ -3,18 +3,18 @@

		#include "Matrix/Matrix.h"
		#include "gem/GEM.h"
		#include "TNL/tnl-dev/src/TNL/Math.h"
		#include <TNL/Math.h>
		#include <typeinfo> // type printf
		#include <TNL/Matrices/MatrixReader.h>

		#include "TNL/tnl-dev/src/TNL/Cuda/CheckDevice.h"
		#include "TNL/tnl-dev/src/TNL/Devices/Cuda.h"
		#include "TNL/tnl-dev/src/TNL/Containers/Vector.h"
		#include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h"
		#include "TNL/tnl-dev/src/TNL/Devices/Host.h"
		#include <TNL/Cuda/CheckDevice.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Containers/Vector.h>
		#include <TNL/Cuda/MemoryHelpers.h>
		#include <TNL/Devices/Host.h>

		#ifdef HAVE_MPI
		#include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h"
		#include <TNL/Communicators/MpiCommunicator.h>
		#include <mpi.h>
		#endif

		@@ -40,9 +40,9 @@ template < typename Real, typename Index >
		void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName );


		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		template < typename Real, typename Index >
		void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix,
		Vector< Real, Devices::Host, Index >& vector,
		const String& matrixName, const String& vectorName,
		Index &rows, Index &nonzeros, const int verbose );

		@@ -55,17 +55,22 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		{
		typedef Matrix< Real, Device, Index > MatrixType;
		typedef Vector< Real, Device, Index > VectorType;
		typedef Matrix< Real, Devices::Host, Index > MatrixTypeHost;
		typedef Vector< Real, Devices::Host, Index > VectorTypeHost;

		int processID = 0; // MPI processID, without mpi == 0
		#ifdef HAVE_MPI
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif

		MatrixTypeHost matrixHost;
		VectorTypeHost vectorHost;
		MatrixType matrix;
		VectorType vector;
		VectorType vectorResult;

		Index rows, nonzeros;
		readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose );
		VectorType vectorResult( rows );
		readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose );

		// Computation
		double* time;
		@@ -80,10 +85,14 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		MatrixType matrixComp = matrix;
		VectorType vectorComp( vector );
		//readMatrixVector( matrixHost, vectorHost, matrixName, vectorName, rows, nonzeros, verbose );
		matrix.reset();
		vector.reset();
		matrix = matrixHost;
		vector = vectorHost;
		vectorResult.setSize( rows );
		vectorResult.setValue( 0 );
		GEM< Real, Device, Index > gem( matrixComp, vectorComp );
		GEM< Real, Device, Index > gem( matrix, vector );

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		@@ -125,15 +134,16 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		timeMean /= loops;

		if( processID == 0 )
		printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(),
		rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(),
		device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error);
		printf("%20s %15s %15s %10d %20s & %15d & %15.3f & %15.3f\n", vectorName == "none" ? "-":vectorName.c_str(),
		device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, matrixName.c_str(),
		rows, timeMean, error);

		delete []time;

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		matrix.reset();
		if( processID == 0 ){
		//printf("%d: returning\n", processID );
		return vectorResult;
		@@ -146,47 +156,39 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		}
		}

		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		template < typename Real, typename Index >
		void readMatrixVector( Matrix< Real, Devices::Host, Index>& matrix,
		Vector< Real, Devices::Host, Index >& vector,
		const String& matrixName, const String& vectorName,
		Index &rows, Index &nonzeros, const int verbose )
		{
		typedef Matrix< Real, Devices::Host, Index> MatrixHost;
		MatrixHost matrixHost;
		if( verbose > 1 )
		cout << "reading matrix " << matrixName << endl;
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose );
		rows = matrixHost.getNumRows();
		nonzeros = matrixHost.getNumNonzeros();

		Matrices::MatrixReader< Matrix< Real, Devices::Host, Index> > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrix, verbose );
		rows = matrix.getNumRows();
		nonzeros = matrix.getNumNonzeros();
		vector.setSize( rows );

		// Get vector
		Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() );

		if( vectorName == "none" )
		calculHostVecOne( matrixHost, vectorHost, vectorName );
		else
		if( vectorName == "none" ){
		calculHostVecOne( matrix, vector, vectorName );
		}else
		{
		readVector( vectorHost, vectorName );
		readVector( vector, vectorName );
		if( verbose > 1 )
		cout << "reading vector " << vectorName << endl;
		}

		#ifdef HAVE_MPI
		cutMatrixVectorMPI( matrixHost, vectorHost );
		cutMatrixVectorMPI( matrix, vector );
		#endif
		if( verbose > 2 )
		matrixHost.showMatrix();
		matrix.showMatrix();

		if( verbose > 2 )
		cout << vectorHost << endl;

		// Copy from CPU into matrix dependent on template Device
		vector = vectorHost;
		matrix = matrixHost;
		cout << vector << endl;
		}

		template < typename Real, typename Index >
		@@ -200,7 +202,7 @@ void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		{
		pom += matrix.getElement(i,j);
		}
		vector[i] = pom;
		vector.setElement( i, pom );
		}

		/*ofstream outdata; // outdata is like cin
		@@ -275,6 +277,8 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
		matrixTemp.setElement(i,j,0);
		}
		}
		//if( j % 1000 == 0 )
		// printf("%d: cutting col %d\n", processID, j );
		}
		for( int i = 0; i < numRowsCUT; i++ ){
		if( i + numRowsCUT * processID < matrix.getNumRows() ){
		@@ -283,7 +287,7 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
		vectorTemp.setElement( i, 0 );
		}
		}

		matrix.reset(); vector.reset();
		matrix = matrixTemp;
		vector = vectorTemp;
		}

src/Benchmarks/GEM/gem/GEMdevice.h

+1 −1

Original line number	Diff line number	Diff line
		#include "TNL/Cuda/MemoryHelpers.h"
		#include <TNL/Cuda/MemoryHelpers.h>
		#define DEBUG 0