MPI with device to device for non-pivoting gem done. (46164f84) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile.inc

+1 −1

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@ PROJECT_NAME = tnl-gem
		TNL_HEADERS = ${HOME}/.local/include
		INSTALL_DIR = ${HOME}/.local
		WITH_CUDA = yes
		WITH_MPI = no
		WITH_MPI = yes
		WITH_OPENMP = yes
		WITH_DEBUG = yes
		MPI_FLAGT =

src/Benchmarks/GEM/Matrix/Matrix.h

+8 −0

Original line number	Diff line number	Diff line
		@@ -79,6 +79,14 @@ class Matrix
		*/
		void setRow( Index row, Index col, Real* mainRow, Index size );

		/**
		* Sets ROW on row and starting column into matrix A. Can be
		* called from host for host and device vector.
		*
		* @param row and column, mainRow is vector with size to be filled with values.
		* @return void.
		*/
		void setRow( Index row, Index col, Vector& mainRow );
		/**
		* Returns ROW on row and starting column. Can be
		* called from host if the matrix is on device and also can be called from

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+24 −0

Original line number	Diff line number	Diff line
		@@ -152,6 +152,30 @@ void Matrix< Real, Device, Index >::setRow( Index row, Index col, Real* mainRow,
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >
		void Matrix< Real, Device, Index >::setRow( Index row, Index col, Vector& mainRow )
		{
		TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
		TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
		if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		#if DEBUG
		printf("On CPU\n");
		#endif
		for( int i = 0; i < mainRow.getSize()-1; i++ )
		this->setElement( row, col + i, mainRow[ i ] );
		}
		#ifdef HAVE_CUDA
		if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		for( int i = 0; i < mainRow.getSize()-1; i++ )
		this->data.setElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i, mainRow.getElement( i ) );
		}
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >

src/Benchmarks/GEM/gem/GEMdeviceMPI.h

+56 −41

Original line number	Diff line number	Diff line
		#define DEBUG 0
		#include <fstream> // saving and loading vector.txt
		#include <string> // input from cmd
		#include <chrono> // clock for debugging

		template < typename Real, typename Index >
		void saveVec( Real* mainRow, Index size, int processID, Index colPointerMain )
		@@ -142,6 +143,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		// Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
		Index colPointerMain = 0;

		double duration[ this->A.getNumColumns() ];
		// Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
		while( colPointerMain < x.getSize() ){
		#ifdef HAVE_MPI
		@@ -244,9 +246,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		// Now when every process has the ProcessMax of pivoting row across all processes
		// we can send pivoting row to all processes from ProcessMax
		// mainRow stores pivoting row
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];
		Array mainRow( this->A.getNumColumns() - colPointerMain + 1 );


		// If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself.
		@@ -256,8 +256,8 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		{
		if( processID == ProcessMax )
		{
		this->A.getRow( Possition, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( Possition );
		this->A.getRow( Possition, colPointerMain, mainRow );
		mainRow.setElement( mainRow.getSize()-1, this->b.getElement( Possition ) );
		}
		} else {
		if( colPointerMain/this->A.getNumRows() == processID ){
		@@ -277,22 +277,22 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		TNL_CHECK_CUDA_DEVICE;
		}

		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		this->A.getRow( colPointer, colPointerMain, mainRow );
		mainRow.setElement( mainRow.getSize()-1, this->b.getElement( colPointer ) );
		}
		}

		// Broad casting the pivoting row to all processes
		#ifdef HAVE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD);
		TNL::Communicators::MpiCommunicator::Bcast( mainRow.getData(), mainRow.getSize()-1, ProcessMax, MPI_COMM_WORLD);
		//if( colPointerMain%100 == 0 )
		// saveVec( mainRow, size, processID, colPointerMain );
		// saveVec( mainRow, mainRow.getSize(), processID, colPointerMain );
		if( verbose > 1 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		for( int i = 0; i < mainRow.getSize(); i++ )
		printf( "%.2f ", mainRow.getElement( i ) );
		printf("]\n");
		}

		@@ -302,61 +302,72 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		Real *mainRowSwap;
		mainRowSwap = new Real[size];
		mainRowSwap = new Real[mainRow.getSize()];


		if( processID == ProcessMax )
		{
		TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
		this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
		this->b.setElement( Possition, mainRowSwap[ size-1 ] );
		TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, mainRow.getSize(), colPointerMain/this->A.getNumRows(), 0 );
		this->A.setRow( Possition, colPointerMain, mainRowSwap, mainRow.getSize() );
		this->b.setElement( Possition, mainRowSwap[ mainRow.getSize()-1 ] );
		}
		else if( processID == colPointerMain/this->A.getNumRows() )
		{
		this->A.getRow( colPointer, colPointerMain, mainRowSwap, size );
		mainRowSwap[ size-1 ] = this->b.getElement( colPointer );
		this->A.getRow( colPointer, colPointerMain, mainRowSwap, mainRow.getSize() );
		mainRowSwap[ mainRow.getSize()-1 ] = this->b.getElement( colPointer );

		TNL::Communicators::MpiCommunicator::Send( mainRowSwap, size, ProcessMax, 0 );
		this->A.setRow( colPointer, colPointerMain, mainRow, size );
		this->b.setElement( colPointer, mainRow[ size-1 ] );
		TNL::Communicators::MpiCommunicator::Send( mainRowSwap, mainRow.getSize(), ProcessMax, 0 );
		this->A.setRow( colPointer, colPointerMain, mainRow );
		this->b.setElement( colPointer, mainRow[ mainRow.getSize()-1 ] );
		}
		delete []mainRowSwap;
		}
		#endif
		// Main kernel works with vector as a main row, so all processes has to set mainRowVec.
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		//TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRow;
		//delete []mainRow;
		}
		else // without pivoting
		{
		std::clock_t start;
		start = std::clock();
		#ifdef HAVE_MPI
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];
		//if( processID == 0 )
		//printf( "Initializing mainRow!\n");

		Real* data;

		if( colPointerMain/this->A.getNumRows() == processID ){
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		this->A.getRow( colPointer, colPointerMain, mainRowVec );
		mainRowVec.setElement( mainRowVec.getSize()-1, this->b.getElement( colPointer ) );

		data = mainRowVec.getData();
		}
		else
		{
		cudaMalloc( &data, mainRowVec.getSize() * sizeof(Real) );
		}

		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
		//printf( "brodcasting mainRow!\n");
		TNL::Communicators::MpiCommunicator::Bcast( data, mainRowVec.getSize(), colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);

		if( verbose > 2 )
		mainRowVec.bind( data, this->A.getNumColumns() - colPointerMain + 1 );

		/*if( verbose > 2 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf("]\n");
		for( int i = 0; i < numOfProcesses; i++ )
		if( i == processID ){
		std::cout << mainRowVec << std::endl;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		}*/

		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		#else
		this->A.getRow(colPointer, colPointerMain, mainRowVec );
		mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) );
		#endif
		duration[ colPointerMain ] = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
		}
		if( verbose > 1 )
		{
		@@ -426,7 +437,11 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
		// increment colPointerMain for next while passage
		colPointerMain++;
		}

		double time;
		for( int i = 0; i < this->A.getNumColumns(); i++ )
		time += duration[ i ];
		time = time/this->A.getNumColumns();
		printf("%d: copy MPI part: %.8f\n", processID, time );
		// delete all used variables
		cudaFree(pivot);
		cudaFree( devMat );

src/Benchmarks/GEM/gem/GEMkernelsMPI.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -3,6 +3,13 @@
		//TODO: Real

		/**********************REDUCTION MAX****************************************/
		template <typename Real>
		__global__ void showData( Real* data, int size, int processID ){
		printf("%d: [ ", processID );
		for( int i = 0; i < size; i++ )
		printf("%.2f ", data[i] );
		printf(" ]\n");
		}

		template <typename Real >
		__inline__ __device__ void warpReduceArgMax(Real& val, int& index) {