Readable code, MPI pivoting working, makefile.inc switch WITH_MPI yes/no made (292bc421) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile.inc

+15 −15

Original line number	Diff line number	Diff line
		@@ -5,11 +5,13 @@ PROJECT_NAME = tnl-gem
		TNL_HEADERS = ${HOME}/.local/include
		INSTALL_DIR = ${HOME}/.local
		WITH_CUDA = yes
		WITH_MPI = yes
		WITH_MPI = no
		WITH_OPENMP = yes
		WITH_DEBUG = yes
		MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
		MPI_FLAGS =-I/home/maty/.openmpi/include
		MPI_FLAGT =
		MPI_FLAGS =
		#MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
		#MPI_FLAGS =-I/home/maty/.openmpi/include

		# If TNL is installed on your system, CUDA arch can be detected automatically using
		# a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'.
		@@ -20,7 +22,6 @@ CUDA_ARCH = auto
		# Set-up compilers
		CXX = g++
		CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc
		#CUDA_CXX =/home/maty/.openmpi/bin/mpicc

		# Set-up CXX_FLAGS
		CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS)
		@@ -31,10 +32,10 @@ else
		endif

		# Set-up CUDA_CXXFLAGS
		CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS)
		CUDA_CXXFLAGS = -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda --std c++14 -I$(TNL_HEADERS)

		ifeq ($(WITH_CUDA), yes)
		CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI
		CUDA_CXXFLAGS += -DHAVE_CUDA
		ifeq ($(CUDA_ARCH), auto)
		CUDA_CXXFLAGS += `tnl-cuda-arch`
		else
		@@ -42,11 +43,10 @@ ifeq ( $(WITH_CUDA), yes )
		endif
		endif


		# Set-up MPI_FLAG
		# Set-up MPI_FLAGT
		ifeq ($(WITH_MPI), yes)
		MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
		MPI_FLAGS =-I/home/maty/.openmpi/include
		MPI_FLAGT += -L/home/maty/.openmpi/lib -lmpi
		MPI_FLAGS += -I/home/maty/.openmpi/include -DHAVE_MPI
		endif

		# Set-up CPPFLAGS

src/Benchmarks/GEM/gem/GEMdevice.h

+133 −170

Original line number	Diff line number	Diff line
		@@ -58,6 +58,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		}
		}
		//std::cout << processID << ": " << x << std::endl;
		#else
		x = device_vector;
		#endif
		cudaFree( ( void* ) devMat );
		TNL_CHECK_CUDA_DEVICE;
		@@ -71,7 +73,8 @@ template < typename Real,
		typename Index >
		bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose )
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat;// = TNL::Cuda::passToDevice( this->A );
		// Copy matrix A and vector b to GPU
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat;
		cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
		cudaMemcpy( ( void* ) devMat,( void* ) &this->A, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
		TNL_CHECK_CUDA_DEVICE;
		@@ -79,15 +82,8 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting

		// FOR PIVOTING SET VARIABLES ON DEVICE
		int* pivot; cudaMalloc(&pivot, sizeof(int));
		int* pom = (int)malloc(sizeof(int));// pom = -1;

		if( verbose > 2 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");
		}
		// Initialise MPI variables even without MPI
		int processID=0;
		int numOfProcesses=1;

		@@ -95,72 +91,86 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
		#endif
		Index colPointerMain = 0;

		while( colPointerMain < x.getSize() ){
		#ifdef HAVE_MPI
		TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();
		if( verbose > 2 )
		{
		#ifdef HAVE_MPI
		if( processID == 0 )
		for( int i = 0; i < numOfProcesses; i++ )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		if( processID == i )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		}
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( i == processID )
		std::cout << device_vector;
		MPI_Barrier(MPI_COMM_WORLD);
		}
		std::cout << std::endl;
		#else
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");
		#endif
		}
		MPI_Barrier(MPI_COMM_WORLD);

		// Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
		Index colPointerMain = 0;

		// Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
		while( colPointerMain < x.getSize() ){
		#ifdef HAVE_MPI
		TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		// Pointer to rows in main process that has colPointerMain.
		Index colPointer = colPointerMain-TNL::floor(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();


		// main row vector for computation (pivoting, non-pivoting)
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;


		// Setting number of threads and blocks for main kernel and for pivoting swapping kernel
		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;


		if( pivoting == "yes" )
		{
		// PIVOTING
		// fromRow saves info for each process saying from which row to start looking for pivot
		Index fromRow = 0;
		if( processID < (Index)(colPointerMain/this->A.getNumRows()) )
		if( processID < TNL::floor(colPointerMain/this->A.getNumRows()) )
		fromRow = this->A.getNumRows();
		else if( processID == (Index)(colPointerMain/this->A.getNumRows()) )
		else if( processID == TNL::floor(colPointerMain/this->A.getNumRows()) )
		fromRow = colPointer;

		// Inicialising vectors for maximum and position (vectors important for multiple blocks, otherwise its vector with length 1)
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1);
		outMax.setValue(0); outPos.setValue(-1);


		// those blocks that have rows to look for pivot in should start looking
		if( fromRow != this->A.getNumRows() )
		{
		// setting size for kernel of pivoting
		int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );

		printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n",
		colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow );

		// resizing outMax and outPos for the kernel of pivoting
		outMax.setSize( reduceGridSize );
		outPos.setSize( reduceGridSize );
		//outMax.setValue(0); outPos.setValue(0);

		// two main pivoting kernels executes and saves max and position into 0-th element of vectors outMax and outPos
		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		@@ -168,11 +178,13 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		//*pom = 0;
		//cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost);
		}
		//std::cout << processID << ": " << outMax << outPos << std::endl;
		// Now each process has info about maximum in the "active" part of matrix that they calculate with

		// Preparing dynamic arrays for MPI send and recv resp. AllGather.
		// data stores information to send from each process
		// recvData stores information that is received
		#ifdef HAVE_MPI
		Real data, recvData;
		data = new Real[2];
		recvData = new Real[2*numOfProcesses];
		@@ -181,80 +193,48 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting

		MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(),
		recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD);
		#endif

		/*MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 0 )
		{
		printf( "%d:", processID );
		for( int i = 0; i < 2*numOfProcesses; i++ )
		printf( "%.2f ", recvData[i] );
		printf("\n");
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d:", processID );
		for( int i = 0; i < 2*numOfProcesses; i++ )
		printf( "%.2f ", recvData[i] );
		printf("\n");
		}*/

		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses );
		TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses );
		// Initialising maximum and possition and process that has the overall maximum ( ?: for non-MPI program)
		Index ProcessMax = numOfProcesses != 1 ? -1 : 0;
		Index Possition = numOfProcesses != 1 ? 0 : outPos.getElement(0);
		Real Maximum = numOfProcesses != 1 ? 0 : outMax.getElement(0);

		for( int i = 0; i < numOfProcesses; i++ )
		#ifdef HAVE_MPI
		// clasic maximum finding + storing processMax and possition
		for( int i = 0; i < 2*numOfProcesses; i = i+2 )
		{
		if( recvData[ i+1 ] != -1 )
		if( Maximum < recvData[ i ] )
		{
		outMaxHost[ i ] = (Real)recvData[ 2*i ];
		outPosHost[ i ] = (Index)recvData[ 2*i+1 ];
		Maximum = recvData[ i ];
		ProcessMax = TNL::floor( i/2 );
		Possition = recvData[ i+1 ];
		}
		}
		if( verbose > 1 && processID == 0 )
		printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax );
		// All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays.
		delete []data;
		delete []recvData;


		/*if( processID == 0 )
		{
		printf( "%d:", processID );
		std::cout << outMaxHost << std::endl;
		std::cout << outPosHost << std::endl;
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d: ", processID );
		std::cout << outMaxHost << std::endl;
		std::cout << outPosHost << std::endl;
		}*/

		Index ProcessMax = -1;
		Index Possition = -1;
		Real Maximum = 0;

		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( outPosHost[ i ] != -1 )
		if( Maximum < outMaxHost[ i ] )
		#endif
		// Clasic Maximum == 0 then we occured zero pivot so ending this calculation
		if( Maximum == 0 )
		{
		Maximum = outMaxHost[ i ];
		ProcessMax = i;
		Possition = outPosHost[ i ];
		}
		std::cout << "Ooops zero pivot occured in " << colPointerMain << "-th step." << std::endl;
		return false;
		}

		/*if( processID == 0 )
		{
		printf( "%d: ", processID );
		std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl;
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d: ", processID );
		std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl;
		}*/
		// Now when every process has the ProcessMax of pivoting row across all processes
		// we can send pivoting row to all processes from ProcessMax
		// mainRow stores pivoting row
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];

		// If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself.
		// else means that ProcessMax is in the main process that contains colPointerMain, in this case we need to do normal pivoting
		// so it swaps rows and fills mainRow normally.
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		if( processID == ProcessMax )
		@@ -280,24 +260,22 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		}
		}
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);

		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		// Broad casting the pivoting row to all processes
		#ifdef HAVE_MPI
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD);

		// Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process.
		// mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows()
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		Real *mainRowSwap;
		mainRowSwap = new Real[size];


		if( processID == ProcessMax )
		{
		TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
		printf( "%d: ", processID );
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRowSwap[ i ] );
		printf( "\n" );
		this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
		this->b.setElement( Possition, mainRowSwap[ size-1 ] );
		}
		@@ -307,48 +285,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		mainRowSwap[ size-1 ] = this->b.getElement( colPointer );

		TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 );
		printf( "%d: ", processID );
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf( "\n" );
		this->A.setRow( colPointer, colPointerMain, mainRow, size );
		this->b.setElement( colPointer, mainRow[ size-1 ] );
		}
		delete []mainRowSwap;
		}
		#ifdef HAVE_MPI
		if( processID == 0 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 0 )
		std::cout << this->b << std::endl;
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		std::cout << this->b << std::endl;
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		std::cout << this->b << std::endl;
		#endif
		// Main kernel works with vector as a main row, so all processes has to set mainRowVec.
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		}
		else
		else // without pivoting
		{
		#ifdef HAVE_MPI
		Real* mainRow;
		@@ -358,14 +306,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		if( colPointerMain/this->A.getNumRows() == processID ){
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		}

		/*for( int i = 0; i < numOfProcesses; i++ )
		if( i != processID ){
		TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 );
		}*/
		} //else {
		//TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 );
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);

		if( verbose > 2 )
		{
		printf( "%d: [", processID);
		@@ -374,23 +318,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		printf("]\n");
		}

		//}
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		#else
		//printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain );
		this->A.getRow(colPointer, colPointerMain, mainRowVec );
		#endif

		}
		/*std::cout << processID << ": " << std::endl;
		std::cout << mainRowVec << std::endl;*/

		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );


		// Finally main kernel calculates the GEM for colPointerMain from mainRowVec
		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		device_vector.getView(),
		mainRowVec.getView(),
		@@ -401,23 +340,47 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		numOfProcesses );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;


		if( verbose > 2 )
		{
		#ifdef HAVE_MPI
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( processID == i )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		}
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( i == processID )
		std::cout << device_vector;
		MPI_Barrier(MPI_COMM_WORLD);
		}
		std::cout << std::endl;
		#else
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		std::cout << this->b << std::endl;
		printf("\n");
		#endif
		}
		//TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		// increment colPointerMain for next while passage
		colPointerMain++;
		}

		// delete all used variables
		cudaFree(pivot);
		free(pom);
		cudaFree( devMat );
		TNL_CHECK_CUDA_DEVICE;

		// Calculate real result
		// (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros)
		calculResultVector( this->A, device_vector, x, processID, numOfProcesses );

		return true;

src/Benchmarks/GEM/main.h

+12 −5

Original line number	Diff line number	Diff line
		@@ -46,8 +46,16 @@ int main( int argc, char* argv[] )
		MPI_Init(NULL, NULL);

		// Get the number of processes
		int processID;
		int processID = -1;
		MPI_Comm_rank(MPI_COMM_WORLD, &processID);
		int numOfProcesses = 0;
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
		int numOfDevices = 0;
		cudaGetDeviceCount(&numOfDevices);
		if( numOfProcesses > numOfDevices && processID == 0 )
		printf("Warning: There is too many processes for computation,"
		" some processes will compute on same device!. (Processes %d, Devices %d)\n", numOfProcesses, numOfDevices);
		cudaSetDevice(processID%numOfDevices);
		#endif


		@@ -85,10 +93,6 @@ int main( int argc, char* argv[] )
		auto result =
		runGEM< float, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting );
		}
		#ifdef HAVE_MPI
		//printf("%d: about to finalize.\n", processID );
		MPI_Finalize();
		#endif

		if( ( precision == "all" \|\| precision == "double" ) )
		{
		@@ -100,6 +104,9 @@ int main( int argc, char* argv[] )
		runGEM< double, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting );
		}

		#ifdef HAVE_MPI
		MPI_Finalize();
		#endif

		return EXIT_SUCCESS;
		}
		No newline at end of file