Pivoting with MPI done, unreadable code! (7d9b98a6) · Commits · TNL / tnl-dev

src/Benchmarks/GEM/Matrix/Matrix.h

+11 −3

Original line number	Diff line number	Diff line
		@@ -62,15 +62,23 @@ class Matrix
		__cuda_callable__ Real getElement( Index row, Index col ) const;

		/**
		* Returns ROW on row and starting column. Cannot be
		* called from host if the matrix is on device and also cannot be called from
		* device if the matrix is on host.
		* Returns ROW on row and starting column. Can be
		* called from host only.
		*
		* @param row and column, mainRow is array with size to be filled with values.
		* @return void.
		*/
		void getRow( Index row, Index col, Real* mainRow, Index size );

		/**
		* Sets ROW on row and starting column into matrix A. Can be
		* called from host only.
		*
		* @param row and column, mainRow is array with size to be filled with values.
		* @return void.
		*/
		void setRow( Index row, Index col, Real* mainRow, Index size );

		/**
		* Returns ROW on row and starting column. Can be
		* called from host if the matrix is on device and also can be called from

src/Benchmarks/GEM/Matrix/Matrix_impl.h

+24 −0

Original line number	Diff line number	Diff line
		@@ -128,6 +128,30 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow,
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >
		void Matrix< Real, Device, Index >::setRow( Index row, Index col, Real* mainRow, Index size )
		{
		TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
		TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
		if( std::is_same< Device, TNL::Devices::Host >::value )
		{
		#if DEBUG
		printf("On CPU\n");
		#endif
		for( int i = 0; i < size-1; i++ )
		this->setElement( row, col + i, mainRow[ i ] );
		}
		#ifdef HAVE_CUDA
		if( std::is_same< Device, TNL::Devices::Cuda >::value )
		{
		for( int i = 0; i < size-1; i++ )
		this->data.setElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i, mainRow[ i ] );
		}
		#endif
		}

		template < typename Real,
		typename Device,
		typename Index >

src/Benchmarks/GEM/gem.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -18,8 +18,6 @@
		#include <mpi.h>
		#endif

		#include "gem/GEMkernels.h"
		#include "TNL/Cuda/MemoryHelpers.h"
		#define COMPARE_RESULTS true;


		@@ -99,7 +97,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
		if( verbose > 1 && processID == 0 )
		cout << "starting computation number " << i+1 << endl;

		gem.solve( vectorResult, (String)"no", verbose );
		gem.solve( vectorResult, pivoting, verbose );
		duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;

src/Benchmarks/GEM/gem/GEMdevice.h

+279 −66

Original line number	Diff line number	Diff line
		@@ -3,9 +3,12 @@


		#ifdef HAVE_CUDA
		//#include "GEMkernels.h"
		#include "GEMkernels.h"


		#ifdef HAVE_MPI
		#include "TNL/Communicators/MpiCommunicator.h"
		#include "TNL/Communicators/MPITypeResolver.h"
		#include <mpi.h>
		#endif

		@@ -91,92 +94,302 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
		#ifdef HAVE_MPI
		MPI_Comm_rank( MPI_COMM_WORLD, &processID );
		MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
		Index colPointerMain = 0;
		#endif
		Index colPointerMain = 0;

		while( colPointerMain < x.getSize() ){
		#ifdef HAVE_MPI
		TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
		Index colPointer = colPointerMain-colPointerMain/this->A.getNumRows() * this->A.getNumRows();
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];

		if( colPointerMain/this->A.getNumRows() == processID ){
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );

		for( int i = 0; i < numOfProcesses; i++ )
		if( i != processID ){
		TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 );
		#endif
		Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();
		#ifdef HAVE_MPI
		if( processID == 0 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		} else {
		TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 );
		if( verbose > 2 )
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf("]\n");
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}

		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		#else
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;
		//printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain );
		this->A.getRow(colPointer, colPointerMain, mainRowVec );
		MPI_Barrier(MPI_COMM_WORLD);
		#endif
		/*std::cout << processID << ": " << std::endl;
		std::cout << mainRowVec << std::endl;*/

		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
		// main row vector for computation (pivoting, non-pivoting)
		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;

		/*if( pivoting == "yes" )

		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;

		if( pivoting == "yes" )
		{
		// PIVOTING
		int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
		Index fromRow = 0;
		if( processID < (Index)(colPointerMain/this->A.getNumRows()) )
		fromRow = this->A.getNumRows();
		else if( processID == (Index)(colPointerMain/this->A.getNumRows()) )
		fromRow = colPointer;

		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1);
		outMax.setValue(0); outPos.setValue(-1);

		if( fromRow != this->A.getNumRows() )
		{
		int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 :
		TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );
		int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
		int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
		//printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );

		TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
		TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
		printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n",
		colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow );

		outMax.setSize( reduceGridSize );
		outPos.setSize( reduceGridSize );
		//outMax.setValue(0); outPos.setValue(0);

		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
		findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		*pom = 0;
		cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost);
		//*pom = 0;
		//cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost);
		}
		//std::cout << processID << ": " << outMax << outPos << std::endl;

		Real data, recvData;
		data = new Real[2];
		recvData = new Real[2*numOfProcesses];
		data[0] = outMax.getElement(0); data[1] = outPos.getElement(0);
		MPI_Barrier( MPI_COMM_WORLD );

		MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(),
		recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD);

		/*MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 0 )
		{
		printf( "%d:", processID );
		for( int i = 0; i < 2*numOfProcesses; i++ )
		printf( "%.2f ", recvData[i] );
		printf("\n");
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d:", processID );
		for( int i = 0; i < 2*numOfProcesses; i++ )
		printf( "%.2f ", recvData[i] );
		printf("\n");
		}*/

		int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
		int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
		int numOfBlocks = this->A.getNumRows() * numBlocksOnRow;
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses );
		TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses );

		for( int i = 0; i < numOfProcesses; i++ )
		{
		outMaxHost[ i ] = (Real)recvData[ 2*i ];
		outPosHost[ i ] = (Index)recvData[ 2*i+1 ];
		}
		delete []data;
		delete []recvData;


		/*if( processID == 0 )
		{
		printf( "%d:", processID );
		std::cout << outMaxHost << std::endl;
		std::cout << outPosHost << std::endl;
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d: ", processID );
		std::cout << outMaxHost << std::endl;
		std::cout << outPosHost << std::endl;
		}*/

		Index ProcessMax = -1;
		Index Possition = -1;
		Real Maximum = 0;

		for( int i = 0; i < numOfProcesses; i++ )
		{
		if( outPosHost[ i ] != -1 )
		if( Maximum < outMaxHost[ i ] )
		{
		Maximum = outMaxHost[ i ];
		ProcessMax = i;
		Possition = outPosHost[ i ];
		}
		}

		/*if( processID == 0 )
		{
		printf( "%d: ", processID );
		std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl;
		}
		MPI_Barrier( MPI_COMM_WORLD );
		if( processID == 1 )
		{
		printf( "%d: ", processID );
		std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl;
		}*/
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];

		/if( pivoting == "yes" )// && pom != -1 && *pom != colPointer )
		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		if( processID == ProcessMax )
		{
		this->A.getRow( Possition, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( Possition );
		}
		} else {
		if( colPointerMain/this->A.getNumRows() == processID ){
		if( Possition != colPointer )
		{
		if( verbose > 1 )
		{
		std::cout << std::endl;
		std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl;
		std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl;
		std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl;
		}
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
		swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );
		}
		}
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);

		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;

		if( ProcessMax != colPointerMain/this->A.getNumRows() )
		{
		Real *mainRowSwap;
		mainRowSwap = new Real[size];

		if( processID == ProcessMax )
		{
		TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
		printf( "%d: ", processID );
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRowSwap[ i ] );
		printf( "\n" );
		this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
		this->b.setElement( Possition, mainRowSwap[ size-1 ] );
		}
		else if( processID == colPointerMain/this->A.getNumRows() )
		{
		this->A.getRow( colPointer, colPointerMain, mainRowSwap, size );
		mainRowSwap[ size-1 ] = this->b.getElement( colPointer );

		TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 );
		printf( "%d: ", processID );
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf( "\n" );
		this->A.setRow( colPointer, colPointerMain, mainRow, size );
		this->b.setElement( colPointer, mainRow[ size-1 ] );
		}
		delete []mainRowSwap;
		}
		#ifdef HAVE_MPI
		if( processID == 0 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		{
		showMatrix<<< 1, 1 >>>( this->A );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 0 )
		std::cout << this->b << std::endl;
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 1 )
		std::cout << this->b << std::endl;
		MPI_Barrier(MPI_COMM_WORLD);
		if( processID == 2 )
		std::cout << this->b << std::endl;
		#endif
		}
		else
		{
		#ifdef HAVE_MPI
		Real* mainRow;
		int size = this->A.getNumColumns() - colPointerMain + 1;
		mainRow = new Real[size];

		if( colPointerMain/this->A.getNumRows() == processID ){
		this->A.getRow( colPointer, colPointerMain, mainRow, size );
		mainRow[ size-1 ] = this->b.getElement( colPointer );

		/*for( int i = 0; i < numOfProcesses; i++ )
		if( i != processID ){
		TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 );
		}*/
		} //else {
		//TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 );
		TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
		if( verbose > 2 )
		{
		printf( "%d: [", processID);
		for( int i = 0; i < size; i++ )
		printf( "%.2f ", mainRow[ i ] );
		printf("]\n");
		}

		//}
		TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
		mainRowVec = mainRowVecHost;
		delete []mainRow;
		#else
		//printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain );
		this->A.getRow(colPointer, colPointerMain, mainRowVec );
		#endif

		}
		/*std::cout << processID << ": " << std::endl;
		std::cout << mainRowVec << std::endl;*/

		if( verbose > 1 )
		printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );


		GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat,
		device_vector.getView(),

src/Benchmarks/GEM/gem/GEMkernels.h

+5 −5

Original line number	Diff line number	Diff line
		@@ -60,10 +60,10 @@ __inline__ __device__ void blockReduceArgMax(Real& val, int& index)
		template <typename Real >
		__global__
		void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A,
		int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
		int fromRow, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
		TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition)
		{
		int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain;
		int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + fromRow;
		Real firstElementInRow = rowPointer >= A->getNumRows() ? 0 : TNL::abs( A->getElement(rowPointer, colPointerMain) );
		int index = rowPointer;
		blockReduceArgMax( firstElementInRow, index );
		@@ -95,12 +95,12 @@ template <typename Real >
		__global__
		void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
		TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
		int colPointerMain, int numBlocksOnRow, int* positionPivot )
		int colPointerMain, int numBlocksOnRow, int positionPivot )
		{
		if( *positionPivot > colPointerMain )
		if( positionPivot > colPointerMain )
		{
		int rowPointer1 = colPointerMain;
		int rowPointer2 = *positionPivot;
		int rowPointer2 = positionPivot;
		int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
		if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
		{