Went back in commits for old GPU code. Now MPI turns on MPI code and without... (111cd841) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Makefile.inc

+1 −1

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@ PROJECT_NAME = tnl-gem
		TNL_HEADERS = ${HOME}/.local/include
		INSTALL_DIR = ${HOME}/.local
		WITH_CUDA = yes
		WITH_MPI = yes
		WITH_MPI = no
		WITH_OPENMP = yes
		WITH_DEBUG = yes
		MPI_FLAGT =

src/Benchmarks/GEM/gem.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -74,9 +74,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve

		if( verbose > 1 )
		cout << "Starting computation on " << device << endl;
		/*char hostname[256];
		gethostname(hostname, sizeof(hostname));
		printf("PID %d on %s ready for attach\n", getpid(), hostname);*/

		for( int i = 0; i < loops; i++ )
		{
		@@ -98,6 +95,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		cout << "starting computation number " << i+1 << endl;

		gem.solve( vectorResult, pivoting, verbose );

		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;

src/Benchmarks/GEM/gem/GEM.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -34,6 +34,10 @@ class GEM

		bool computeLUDecomposition( int verbose = 0 );

		#ifdef HAVE_MPI
		bool GEMdeviceMPI( Array& x, const TNL::String& pivoting, int verbose );
		#endif

		bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose );

		bool setMatrixVector( MatrixGEM& A, Array& b ){

src/Benchmarks/GEM/gem/GEM_impl.h

+11 −1

Original line number	Diff line number	Diff line
		@@ -13,7 +13,11 @@
		#pragma once

		#include <TNL/Assert.h>
		#ifdef HAVE_MPI
		#include "GEMdeviceMPI.h"
		#else
		#include "GEMdevice.h"
		#endif
		#include "GEM.h"
		#include <fstream>

		@@ -295,9 +299,15 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda >
		TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
		{
		#ifdef HAVE_CUDA
		#ifdef HAVE_MPI
		if( verbose > 1 )
		printf("Starting the computation SolveGEM.\n");
		printf("Starting the computation SolveGEM MPI.\n");
		gem.GEMdeviceMPI( x, pivoting, verbose );
		#else
		if( verbose > 1 )
		printf("Starting the computation SolveGEM MPI.\n");
		gem.GEMdevice( x, pivoting, verbose );
		#endif
		#endif
		return true;
		}

src/Benchmarks/GEM/gem/GEMdevice.h

+55 −376

Original line number	Diff line number	Diff line
		#include "TNL/Cuda/MemoryHelpers.h"
		#define DEBUG 0
		#include <fstream> // saving and loading vector.txt
		#include <string> // input from cmd

		template < typename Real, typename Index >
		void saveVec( Real* mainRow, Index size, int processID, Index colPointerMain )
		{
		std::ofstream outdata; // outdata is like cin
		std::string s( "./test-matrices/mainRow" );
		s = s + std::to_string(processID) + "_" + std::to_string(colPointerMain);
		outdata.open(s); // opens the file
		if( !outdata ) { // file couldn't be opened
		std::cerr << "Error: file could not be opened" << std::endl;
		exit(1);
		}

		for( int i = 0; i < size; i++ )
		{
		outdata << mainRow[ i ] << std::endl;
		}
		outdata.close();
		}

		#ifdef HAVE_CUDA
		#include "GEMkernels.h"


		#ifdef HAVE_MPI
		#include "TNL/Communicators/MpiCommunicator.h"
		#include "TNL/Communicators/MPITypeResolver.h"
		#include <mpi.h>
		#endif

		template < typename Real,
		typename Index >
		void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector,
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x,
		int processID, int numOfProcesses )
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
		{
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat;// = TNL::Cuda::passToDevice( this->A );
		cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
		cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
		TNL_CHECK_CUDA_DEVICE;
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);

		int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows();
		int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
		int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
		int numOfBlocks = matrix.getNumRows() * numBlocksOnColumn;


		GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), processID );
		GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		#ifdef HAVE_MPI
		if( processID != 0 )
		{
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > host_vector;
		host_vector = device_vector;
		Real* data = host_vector.getData();

		if(processID*device_vector.getSize() < x.getSize() ){
		TNL::Communicators::MpiCommunicator::ISend( data, device_vector.getSize(), 0, 0 );
		}

		}else{
		for( int j = 0; j < device_vector.getSize(); j++ )
		x.setElement( j, device_vector.getElement( j ) );
		for( int i = 1; i < numOfProcesses && i*device_vector.getSize() < x.getSize(); i++ )
		{
		Real* data;
		data = new Real[ device_vector.getSize() ];
		TNL::Communicators::MpiCommunicator::Recv( data, device_vector.getSize(), i, 0 );

		for( int j = 0; j < device_vector.getSize() && i * device_vector.getSize() + j < x.getSize(); j++ )
		x.setElement( i * device_vector.getSize() + j, data[ j ] );
		delete []data;
		}
		}
		//std::cout << processID << ": " << x << std::endl;
		#else
		x = device_vector;
		#endif
		cudaFree( ( void* ) devMat );
		TNL_CHECK_CUDA_DEVICE;
		}


		@@ -92,345 +32,84 @@ template < typename Real,
		typename Index >
		bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose )
		{
		// Copy matrix A and vector b to GPU
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat;
		cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
		cudaMemcpy( ( void* ) devMat,( void* ) &this->A, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
		TNL_CHECK_CUDA_DEVICE;
		Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );

		// FOR PIVOTING SET VARIABLES ON DEVICE
		int* pivot; cudaMalloc(&pivot, sizeof(int));

		// Initialise MPI variables even without MPI
		int processID=0;
		int numOfProcesses=1;

		#ifdef HAVE_MPI
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
		#endif
		size_t size = sizeof(int);
		int* pivot; cudaMalloc(&pivot, size);
		int* pom = (int)malloc(size); pom = -1;

		if( verbose > 2 )
		{
		#ifdef HAVE_MPI
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( processID == i )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		}
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( i == processID )
		std::cout << device_vector;
		MPI_Barrier(MPI_COMM_WORLD);
		}
		std::cout << std::endl;
		#else
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		printf("\n");
		#endif
		}

		// Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
		Index colPointerMain = 0;

		// Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
		while( colPointerMain < x.getSize() ){
		#ifdef HAVE_MPI
		TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		#endif
		// Pointer to rows in main process that has colPointerMain.
		Index colPointer = colPointerMain-TNL::floor(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();


		// main row vector for computation (pivoting, non-pivoting)
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );

		// Setting number of threads and blocks for main kernel and for pivoting swapping kernel
		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;

		for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
		{
		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );

		if( pivoting == "yes" )
		{
		// PIVOTING
		// fromRow saves info for each process saying from which row to start looking for pivot
		Index fromRow = 0;
		if( processID < TNL::floor(colPointerMain/this->A.getNumRows()) )
		fromRow = this->A.getNumRows();
		else if( processID == TNL::floor(colPointerMain/this->A.getNumRows()) )
		fromRow = colPointer;

		// Inicialising vectors for maximum and position (vectors important for multiple blocks, otherwise its vector with length 1)
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1);
		outMax.setValue(0); outPos.setValue(-1);


		// those blocks that have rows to look for pivot in should start looking
		if( fromRow != this->A.getNumRows() )
		{
		// setting size for kernel of pivoting
		int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
		//printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );

		// resizing outMax and outPos for the kernel of pivoting
		outMax.setSize( reduceGridSize );
		outPos.setSize( reduceGridSize );
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
		//outMax.setValue(0); outPos.setValue(0);

		// two main pivoting kernels executes and saves max and position into 0-th element of vectors outMax and outPos
		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() );
		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		*pom = 0;
		cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
		}
		// Now each process has info about maximum in the "active" part of matrix that they calculate with

		// Preparing dynamic arrays for MPI send and recv resp. AllGather.
		// data stores information to send from each process
		// recvData stores information that is received
		#ifdef HAVE_MPI
		Real data, recvData;
		data = new Real[2];
		recvData = new Real[2*numOfProcesses];
		data[0] = outMax.getElement(0); data[1] = outPos.getElement(0);
		MPI_Barrier( MPI_COMM_WORLD );

		MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(),
		recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD);
		#endif

		// Initialising maximum and possition and process that has the overall maximum ( ?: for non-MPI program)
		Index ProcessMax = numOfProcesses != 1 ? -1 : 0;
		Index Possition = numOfProcesses != 1 ? 0 : outPos.getElement(0);
		Real Maximum = numOfProcesses != 1 ? 0 : outMax.getElement(0);

		#ifdef HAVE_MPI
		// clasic maximum finding + storing processMax and possition
		for( int i = 0; i < 2*numOfProcesses; i = i+2 )
		{
		if( recvData[ i+1 ] != -1 )
		if( Maximum < recvData[ i ] )
		{
		Maximum = recvData[ i ];
		ProcessMax = TNL::floor( i/2 );
		Possition = recvData[ i+1 ];
		}
		}
		if( verbose > 1 && processID == 0 )
		printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax );
		// All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays.
		delete []data;
		delete []recvData;
		#endif
		// Clasic Maximum == 0 then we occured zero pivot so ending this calculation
		if( Maximum == 0 )
		{
		std::cout << "Ooops zero pivot occured in " << colPointerMain << "-th step." << std::endl;
		return false;
		}

		// Now when every process has the ProcessMax of pivoting row across all processes
		// we can send pivoting row to all processes from ProcessMax
		// mainRow stores pivoting row
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];
		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;


		// If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself.
		// else means that ProcessMax is in the main process that contains colPointerMain, in this case we need to do normal pivoting
		// so it swaps rows and fills mainRow normally.
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		if( processID == ProcessMax )
		{
		this->A.getRow( Possition, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( Possition );
		}
		} else {
		if( colPointerMain/this->A.getNumRows() == processID ){
		if( Possition != colPointer )
		if( pivoting == "yes" )// && pom != -1 && pom != colPointer )
		{
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl;
		std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		}
		}

		// Broad casting the pivoting row to all processes
		#ifdef HAVE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD);
		//if( colPointerMain%100 == 0 )
		// saveVec( mainRow, size, processID, colPointerMain );
		if( verbose > 1 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf("]\n");
		}


		// Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process.
		// mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows()
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		Real *mainRowSwap;
		mainRowSwap = new Real[size];


		if( processID == ProcessMax )
		{
		TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
		this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
		this->b.setElement( Possition, mainRowSwap[ size-1 ] );
		}
		else if( processID == colPointerMain/this->A.getNumRows() )
		{
		this->A.getRow( colPointer, colPointerMain, mainRowSwap, size );
		mainRowSwap[ size-1 ] = this->b.getElement( colPointer );

		TNL::Communicators::MpiCommunicator::Send( mainRowSwap, size, ProcessMax, 0 );
		this->A.setRow( colPointer, colPointerMain, mainRow, size );
		this->b.setElement( colPointer, mainRow[ size-1 ] );
		}
		delete []mainRowSwap;
		}
		#endif
		// Main kernel works with vector as a main row, so all processes has to set mainRowVec.
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		}
		else // without pivoting
		{
		#ifdef HAVE_MPI
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];

		if( colPointerMain/this->A.getNumRows() == processID ){
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		}

		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);

		if( verbose > 2 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf("]\n");
		}

		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		#else
		this->A.getRow(colPointer, colPointerMain, mainRowVec );
		mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) );
		#endif
		}
		if( verbose > 1 )
		{
		#ifdef HAVE_MPI
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( processID == i )
		{
		std::cout << mainRowVec << std::endl;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		}
		#else
		std::cout << mainRowVec << std::endl;
		#endif
		}
		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() );
		//std::cout << mainRowVec << std::endl;
		// Finally main kernel calculates the GEM for colPointerMain from mainRowVec
		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		device_vector.getView(),
		mainRowVec.getView(),
		colPointer,
		colPointerMain,
		numBlocksOnRow,
		processID,
		numOfProcesses );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;


		if( verbose > 2 )
		{
		#ifdef HAVE_MPI
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( processID == i )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		}
		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( i == processID )
		std::cout << device_vector;
		MPI_Barrier(MPI_COMM_WORLD);
		}
		std::cout << std::endl;
		#else
		showMatrix<<< 1, 1 >>>( this->A );
		numBlocksOnRow );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		std::cout << this->b << std::endl;
		printf("\n");
		#endif
		}
		// increment colPointerMain for next while passage
		colPointerMain++;
		}

		// delete all used variables
		cudaFree(pivot);
		cudaFree( devMat );
		TNL_CHECK_CUDA_DEVICE;
		free(pom);

		calculResultVector( this->A, device_vector, x );

		// Calculate real result
		// (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros)
		calculResultVector( this->A, device_vector, x, processID, numOfProcesses );
		//if( processID == 0 )
		// std::cout << x << std::endl;
		return true;
		}