MPI without pivoting with unknown error depending on !!!loops!!! (dc689b13) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile

+6 −6

Original line number	Diff line number	Diff line
		@@ -41,14 +41,14 @@ clean_curdir:
		dist: clean
		tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES)

		$(TARGETS): % : %.o
		$(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS)
		#$(TARGETS): % : %.o
		# $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS)

		$(CUDA_TARGETS): % : %.cu.o
		$(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS)
		$(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS)

		$(SOURCES:%.cpp=%.o): %.o: %.cpp
		$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
		#$(SOURCES:%.cpp=%.o): %.o: %.cpp
		# $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<

		$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu
		$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $<
		$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $<

src/Benchmarks/GEM/Makefile.inc

+3 −2

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ CUDA_ARCH = auto
		# Set-up compilers
		CXX = g++
		CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc
		#CUDA_CXX =/home/maty/.openmpi/bin/mpicc

		# Set-up CXX_FLAGS
		CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS)
		@@ -27,10 +28,10 @@ else
		endif

		# Set-up CUDA_CXXFLAGS
		CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA --std c++14 -I$(TNL_HEADERS)
		CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS)

		ifeq ( $(WITH_CUDA), yes )
		CUDA_CXXFLAGS += -DHAVE_CUDA
		CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI
		ifeq ( $(CUDA_ARCH), auto )
		CUDA_CXXFLAGS += `tnl-cuda-arch`
		else

src/Benchmarks/GEM/Matrix/Matrix.h

+23 −3

Original line number	Diff line number	Diff line
		@@ -51,7 +51,7 @@ class Matrix
		void setElement( Index row, Index col, Real value );

		/**
		* Returns element on row and column but on device that it's saved! Cannot be
		* Returns element on row and column. Cannot be
		* called from host if the matrix is on device and also cannot be called from
		* device if the matrix is on host.
		*
		@@ -60,11 +60,31 @@ class Matrix
		*/
		__cuda_callable__ Real getElement( Index row, Index col ) const;

		/**
		* Returns ROW on row and starting column. Cannot be
		* called from host if the matrix is on device and also cannot be called from
		* device if the matrix is on host.
		*
		* @param row and column, mainRow is array with size to be filled with values.
		* @return void.
		*/
		void getRow( Index row, Index col, Real* mainRow, Index size );

		/**
		* Returns ROW on row and starting column. Can be
		* called from host if the matrix is on device and also can be called from
		* device if the matrix is on host. Values are saved in vector mainRow
		*
		* @param row and column, mainRow is output vector.
		* @return void
		*/
		void getRow( Index row, Index col, Vector& mainRow );

		/**
		* Saves data array into array parameter even in cross device function matrix.
		* Data array is huge on device for coalesced memory access condition!
		*
		* @param VectorView eather on host or device, depends where we want to store the data.
		* @param Vector eather on host or device, depends where we want to store the data.
		* @return void
		*/
		template < typename Device1 >

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+57 −8

Original line number	Diff line number	Diff line

		//#include "Matrix.h"

		#define DEBUG 0

		template < typename Real,
		@@ -92,12 +95,65 @@ Real Matrix< Real, Device, Index >::getElement( Index row, Index col ) const
		return 1;
		}

		template < typename Real,
		typename Device,
		typename Index >
		void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, Index size )
		{
		TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
		TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
		if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		#if DEBUG
		printf("On CPU\n");
		#endif
		for( int i = 0; i < size-1; i++ )
		mainRow[ i ] = this->getElement( row, col + i );
		}
		#ifdef HAVE_CUDA
		if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		for( int i = 0; i < size-1; i++ )
		mainRow[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i );
		}
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >
		void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRow )
		{
		TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
		TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
		if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		#if DEBUG
		printf("On CPU\n");
		#endif
		for( int i = 0; i < mainRow.getSize()-1; i++ )
		mainRow[ i ] = this->getElement( row, col + i );
		}
		#ifdef HAVE_CUDA
		if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > tempVec( mainRow.getSize() );
		for( int i = 0; i < mainRow.getSize()-1; i++ )
		tempVec[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i );
		mainRow = tempVec;
		}
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >
		template < typename Device1 >
		void Matrix< Real, Device, Index >::getData( TNL::Containers::Vector< Real, Device1, Index >& dataOut )
		{
		/*dataOut.setSize( this->data.getSize() );
		for( int i = 0; i < this->data.getSize(); i++ )
		dataOut.setElement( i , this->data.getElement( i ));*/
		dataOut = this->data;
		}

		@@ -161,31 +217,24 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
		matrix.getData( pom );
		if( std::is_same< Device, Device2 >::value )
		{
		//printf("copy host to host or device to device\n");
		printf("copy host to host or device to device\n");
		this->data = pom;
		}
		#ifdef HAVE_CUDA
		else if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		#if DEBUG
		printf("copy device to host\n");
		#endif
		for( int i = 0; i < this->rows; i++ )
		for( int j = 0; j < this->columns; j++ )
		this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] );
		}
		else if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		#if DEBUG
		printf("copy host to device\n");
		printf("data size alocated: %d\n", this->data.getSize() );
		#endif
		for( int i = 0; i < this->getNumRows(); i++ )
		for( int j = 0; j < this->getNumColumns(); j++ )
		{
		#if DEBUG
		printf("copying element i,j = %d, %d with value %.4f\n", i, j, pom.getElement( i * this->columns + j ) );
		#endif
		this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j,
		pom.getElement( i * this->columns + j ) );
		}

src/Benchmarks/GEM/gem.h

+199 −53

Original line number	Diff line number	Diff line
		@@ -11,9 +11,15 @@
		#include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h"
		#include "TNL/tnl-dev/src/TNL/Devices/Host.h"

		#ifdef HAVE_MPI
		#include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h"
		#include <mpi.h>
		#endif

		#include "gem/GEMkernels.h"
		#include "TNL/Cuda/MemoryHelpers.h"
		#define COMPARE_RESULTS true;

		// Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting

		using namespace TNL;
		using namespace TNL::Containers;
		@@ -24,6 +30,12 @@ template < typename Real, typename Index >
		void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
		Vector< Real, Devices::Host, Index >& vector, const String& vectorName );

		#ifdef HAVE_MPI
		template< typename Real, typename Index >
		void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
		Vector< Real, Devices::Host, Index >& vector );
		#endif

		template < typename Real, typename Index >
		void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName );

		@@ -31,7 +43,8 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName, const int verbose );
		const String& matrixName, const String& vectorName,
		Index &rows, Index &nonzeros, const int verbose );


		template < typename Real,
		@@ -42,12 +55,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		{
		typedef Matrix< Real, Device, Index > MatrixType;
		typedef Vector< Real, Device, Index > VectorType;
		int processID = 0; // MPI processID, without mpi == 0
		#ifdef HAVE_MPI
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif

		MatrixType matrix;
		VectorType vector;

		readMatrixVector( matrix, vector, matrixName, vectorName, verbose );
		VectorType vectorResult( matrix.getNumRows() );
		Index rows, nonzeros;
		readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose );
		VectorType vectorResult( rows );
		vectorResult.setValue(0);
		MatrixType matrixComp;
		VectorType vectorComp( vector );

		// Computation
		double* time;
		@@ -58,22 +79,67 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		cout << "Starting computation on " << device << endl;
		for( int i = 0; i < loops; i++ )
		{
		MatrixType matrixComp = matrix;
		VectorType vectorComp( vector );
		vectorResult.setValue( 0 );

		printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops );
		#ifdef HAVE_MPI
		MPI_Barrier( MPI_COMM_WORLD );
		#endif
		printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops );
		printf("process %d is in loop %d from all loops %d\n", processID, i, loops );
		#ifdef HAVE_CUDA
		printf( "%d process:\n", processID );
		showMatrix<<< 1, 1 >>>( matrix );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		std::cout << vector << endl;
		cout << vectorResult << endl;
		#endif

		matrixComp = matrix;

		printf("%d: matrix coppied\n", processID );
		#ifdef HAVE_MPI
		MPI_Barrier( MPI_COMM_WORLD );
		#endif
		#ifdef HAVE_CUDA
		printf( "%d process:\n", processID );
		showMatrix<<< 1, 1 >>>( matrixComp );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		#endif
		cout << processID << ":" << vector << endl;
		cout << processID << ":" << vectorComp << endl;
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		printf("%d:copiing Vector \n", processID);
		//vectorComp = vector;

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		//vectorResult.setValue( 0 );

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		GEM< Real, Device, Index > gem( matrixComp, vectorComp );

		std::clock_t start;
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		double duration;

		std::clock_t start;
		start = std::clock();

		if( verbose > 1 )
		if( verbose > 1 && processID == 0 )
		cout << "starting computation number " << i+1 << endl;
		gem.solve( vectorResult, pivoting, verbose );

		gem.solve( vectorResult, (String)"no", verbose );
		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		if( processID == 0 )
		{
		time[i] = duration;

		if( vectorName == "none" )
		@@ -88,7 +154,11 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		printf( "Norm in %d calculation is %.4f\n", i+1, l2norm);
		}
		}
		if( verbose > 1 )
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		}
		if( verbose > 1 && processID == 0 )
		printf("\n ... done!\n");

		double timeMean = 0;
		@@ -96,19 +166,34 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		timeMean += time[i];
		timeMean /= loops;

		if( processID == 0 )
		printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(),
		matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(),
		rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(),
		device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error);

		delete []time;

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif

		if( processID == 0 ){
		printf("%d: returning\n", processID );
		return vectorResult;

		}
		else{
		printf("%d: returning\n", processID );
		vectorResult.setValue( 0 );
		return vectorResult;
		}
		}

		template < typename Real, typename Index, typename Device >
		void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		Vector< Real, Device, Index >& vector,
		const String& matrixName, const String& vectorName, const int verbose )
		const String& matrixName, const String& vectorName,
		Index &rows, Index &nonzeros, const int verbose )
		{
		typedef Matrix< Real, Devices::Host, Index> MatrixHost;
		MatrixHost matrixHost;
		@@ -117,9 +202,9 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		// Get matrix
		Matrices::MatrixReader< MatrixHost > m;
		m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose );
		if( verbose > 2 )
		matrixHost.showMatrix();
		matrix = matrixHost;
		rows = matrixHost.getNumRows();
		nonzeros = matrixHost.getNumNonzeros();


		// Get vector
		Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() );
		@@ -132,9 +217,19 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		if( verbose > 1 )
		cout << "reading vector " << vectorName << endl;
		}

		#ifdef HAVE_MPI
		cutMatrixVectorMPI( matrixHost, vectorHost );
		#endif
		if( verbose > 2 )
		matrixHost.showMatrix();

		if( verbose > 2 )
		cout << vectorHost << endl;

		// Copy from CPU into matrix dependent on template Device
		vector = vectorHost;
		matrix = matrixHost;

		}

		@@ -186,3 +281,54 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
		}
		inFile.close();
		}


		#ifdef HAVE_MPI
		template< typename Real, typename Index >
		void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
		Vector< Real, Devices::Host, Index >& vector )
		{
		Matrix< Real, Devices::Host, Index> matrixTemp;
		Vector< Real, Devices::Host, Index > vectorTemp;

		int processID;
		int numOfProcesses;

		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );

		if( processID == 0 )
		{
		printf( "%d: %d\n", numOfProcesses, processID );
		matrix.showMatrix();
		cout << vector << endl;
		}


		Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses );
		matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() );
		vectorTemp.setSize( numRowsCUT );

		//printf( "%d: %d num of rows = %d\n", numOfProcesses, processID, numRowsCUT );

		for( int j = 0; j < matrix.getNumColumns(); j++ ){
		for( int i = 0; i < numRowsCUT; i++ ){
		if( i + numRowsCUT * processID < matrix.getNumRows() ){
		matrixTemp.setElement( i, j, matrix.getElement( i + numRowsCUT * processID, j ) );
		}else{
		matrixTemp.setElement(i,j,0);
		}
		}
		}
		for( int i = 0; i < numRowsCUT; i++ ){
		if( i + numRowsCUT * processID < matrix.getNumRows() ){
		vectorTemp.setElement( i, vector.getElement( i + numRowsCUT * processID ) );
		}else{
		vectorTemp.setElement( i, 0 );
		}
		}

		matrix = matrixTemp;
		vector = vectorTemp;
		}
		#endif
		No newline at end of file