Error repaired, cuda MPI without pivoting working (519f8715) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile

+7 −7

Original line number	Diff line number	Diff line
		include Makefile.inc

		SUBDIRS =
		HEADERS = "./TNL/src/TNL"
		HEADERS = "./TNL/tnl-dev/src/TNL"
		# C++ translation unit for the host build. The name has to match the
		# tnl-gem target: the object rule generated from SOURCES must produce the
		# tnl-gem.o that the link rule for $(TARGETS) depends on.
		SOURCES = tnl-gem.cpp
		CUDA_SOURCES = tnl-gem-cuda.cu
		TARGETS = tnl-gem
		@@ -41,14 +41,14 @@ clean_curdir:
		dist: clean
		tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES)

		#$(TARGETS): % : %.o
		# $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS)
		$(TARGETS): % : %.o
		$(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS)

		$(CUDA_TARGETS): % : %.cu.o
		$(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS)
		$(CUDA_CXX) $(CUDA_LDFLAGS) $(MPI_FLAGS) $(MPI_FLAGT) -o $@ $< $(CUDA_LDLIBS)

		#$(SOURCES:%.cpp=%.o): %.o: %.cpp
		# $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
		$(SOURCES:%.cpp=%.o): %.o: %.cpp
		$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<

		$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu
		$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $<
		$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(MPI_FLAGS) -c -o $@ $<

src/Benchmarks/GEM/Makefile.inc

+10 −0

Original line number	Diff line number	Diff line
		@@ -5,8 +5,11 @@ PROJECT_NAME = tnl-gem
		TNL_HEADERS = ${HOME}/.local/include
		INSTALL_DIR = ${HOME}/.local
		WITH_CUDA = yes
		WITH_MPI = yes
		WITH_OPENMP = yes
		WITH_DEBUG = yes
		MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
		MPI_FLAGS =-I/home/maty/.openmpi/include

		# If TNL is installed on your system, CUDA arch can be detected automatically using
		# a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'.
		@@ -39,6 +42,13 @@ ifeq ( $(WITH_CUDA), yes )
		endif
		endif


		# Set up the MPI flags used by the CUDA compile and link rules:
		# MPI_FLAGS holds the include path (-I), MPI_FLAGT the library path and
		# library (-L / -l).
		# GNU make keeps the blank after '(' and the blank before ')' as part of the
		# strings compared by ifeq, so the arguments are written without padding and
		# WITH_MPI is stripped before the comparison.
		ifeq ($(strip $(WITH_MPI)),yes)
		MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
		MPI_FLAGS =-I/home/maty/.openmpi/include
		endif

		# Set-up CPPFLAGS
		CPPFLAGS = -MD -MP

src/Benchmarks/GEM/Matrix/Matrix.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -26,6 +26,7 @@ class Matrix

		Matrix();
		Matrix( const Index rows, const Index columns );
		Matrix( Matrix< Real, Device, Index>& matrix );

		/**
		* Sets dimension for matrix of rows x columns elements and allocates memory

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+13 −8

Original line number	Diff line number	Diff line

		//#include "Matrix.h"

		#define DEBUG 0

		template < typename Real,
		@@ -17,6 +15,17 @@ Matrix< Real, Device, Index >::Matrix( Index rows, Index columns )
		{
		this->setDimensions( rows, columns );
		}

		/**
		* Constructs a matrix from another matrix with the same Real, Device and
		* Index template arguments. The row and column counts are taken from the
		* source matrix, and this object's data container is handed to the source's
		* getData().
		*/
		template < typename Real,
		typename Device,
		typename Index >
		Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix )
		: rows( matrix.getNumRows() ), columns( matrix.getNumColumns() )
		{
		// NOTE(review): getData() appears to fill the container passed to it
		// (operator= uses it the same way) - confirm that it also sizes this->data,
		// since setDimensions() is not called here.
		matrix.getData( this->data );
		}


		template < typename Real,
		typename Device,
		typename Index >
		@@ -164,9 +173,9 @@ template < typename Real,
		__cuda_callable__
		void Matrix< Real, Device, Index >::showMatrix()
		{
		for( int i = 0; i < rows; i++ )
		for( int i = 0; i < this->rows; i++ )
		{
		for( int j = 0; j < columns; j++ )
		for( int j = 0; j < this->columns; j++ )
		printf("%.4f ", this->getElement(i,j));
		printf("\n");
		}
		@@ -217,21 +226,17 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
		matrix.getData( pom );
		if( std::is_same< Device, Device2 >::value )
		{
		printf("copy host to host or device to device\n");
		this->data = pom;
		}
		#ifdef HAVE_CUDA
		else if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		printf("copy device to host\n");
		for( int i = 0; i < this->rows; i++ )
		for( int j = 0; j < this->columns; j++ )
		this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] );
		}
		else if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		printf("copy host to device\n");
		printf("data size alocated: %d\n", this->data.getSize() );
		for( int i = 0; i < this->getNumRows(); i++ )
		for( int j = 0; j < this->getNumColumns(); j++ )
		{

src/Benchmarks/GEM/gem.h

+15 −55

Original line number	Diff line number	Diff line
		#include <chrono>
		#include <thread>

		#include "Matrix/Matrix.h"
		#include "gem/GEM.h"
		@@ -66,9 +68,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		Index rows, nonzeros;
		readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose );
		VectorType vectorResult( rows );
		vectorResult.setValue(0);
		MatrixType matrixComp;
		VectorType vectorComp( vector );

		// Computation
		double* time;
		@@ -77,51 +76,18 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve

		if( verbose > 1 )
		cout << "Starting computation on " << device << endl;
		/*char hostname[256];
		gethostname(hostname, sizeof(hostname));
		printf("PID %d on %s ready for attach\n", getpid(), hostname);*/

		for( int i = 0; i < loops; i++ )
		{
		printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops );
		#ifdef HAVE_MPI
		MPI_Barrier( MPI_COMM_WORLD );
		#endif
		printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops );
		printf("process %d is in loop %d from all loops %d\n", processID, i, loops );
		#ifdef HAVE_CUDA
		printf( "%d process:\n", processID );
		showMatrix<<< 1, 1 >>>( matrix );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		std::cout << vector << endl;
		cout << vectorResult << endl;
		#endif

		matrixComp = matrix;

		printf("%d: matrix coppied\n", processID );
		#ifdef HAVE_MPI
		MPI_Barrier( MPI_COMM_WORLD );
		#endif
		#ifdef HAVE_CUDA
		printf( "%d process:\n", processID );
		showMatrix<<< 1, 1 >>>( matrixComp );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		#endif
		cout << processID << ":" << vector << endl;
		cout << processID << ":" << vectorComp << endl;
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		printf("%d:copiing Vector \n", processID);
		//vectorComp = vector;

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		//vectorResult.setValue( 0 );

		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		MatrixType matrixComp = matrix;
		VectorType vectorComp( vector );
		vectorResult.setValue( 0 );
		GEM< Real, Device, Index > gem( matrixComp, vectorComp );

		#ifdef HAVE_MPI
		@@ -135,9 +101,8 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve

		gem.solve( vectorResult, (String)"no", verbose );
		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif


		if( processID == 0 )
		{
		time[i] = duration;
		@@ -154,9 +119,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		printf( "Norm in %d calculation is %.4f\n", i+1, l2norm);
		}
		}
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		}
		if( verbose > 1 && processID == 0 )
		printf("\n ... done!\n");
		@@ -176,14 +138,13 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		#ifdef HAVE_MPI
		Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif

		if( processID == 0 ){
		printf("%d: returning\n", processID );
		//printf("%d: returning\n", processID );
		return vectorResult;

		}
		else{
		printf("%d: returning\n", processID );
		//("%d: returning\n", processID );
		vectorResult.setValue( 0 );
		return vectorResult;
		}
		@@ -230,7 +191,6 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
		// Copy from CPU into matrix dependent on template Device
		vector = vectorHost;
		matrix = matrixHost;

		}

		template < typename Real, typename Index >
		@@ -297,13 +257,13 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );

		if( processID == 0 )
		/*if( processID == 0 )
		{
		printf( "%d: %d\n", numOfProcesses, processID );
		matrix.showMatrix();
		cout << vector << endl;
		}

		*/

		Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses );
		matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() );