Commit 82328fca authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

MPI device to device for non-pivoting GEM done!

parent 46164f84
Loading
Loading
Loading
Loading
+10 −1
Original line number Diff line number Diff line
@@ -70,9 +70,18 @@ class Matrix
    */
    void getRow( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Returns the row at the given row index and starting column. Can be
    * called from host only, for mainRow allocated on the CPU.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
    */
    void getRowGPU( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Sets ROW on row and starting column into matrix A. Can be
    * called from host only.
    * called from host only, for mainRow allocated on the GPU.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
+42 −3
Original line number Diff line number Diff line
@@ -128,6 +128,32 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow,
#endif
}

template < typename Real,
           typename Device,
           typename Index >
void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainRow, Index size )
{
  // Copies size-1 elements of row `row`, starting at column `col`, into the
  // device array `mainRow`. Host-callable only; `mainRow` must point to GPU
  // memory (the kernel writes it directly).
  TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have negative row nor negative column!");
  TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix doesn't have that many rows or columns!");
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    // The kernel processes size-1 elements; with size <= 1 the original code
    // would compute blockSize == 0 and issue an invalid launch configuration.
    if( size <= 1 )
      return;

    // Device-side copy of *this so the kernel can call A->getElement().
    Matrix< Real, TNL::Devices::Cuda, Index >* devMat;
    cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
    cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
    TNL_CHECK_CUDA_DEVICE;

    int blockSize = size-1 > 256 ? 256 : size-1;
    // Grid size is the CEILING of (size-1)/blockSize. The previous
    // roundToMultiple( size-1, blockSize ) yields a total THREAD count,
    // which, used as a block count, over-launches by a factor of blockSize.
    int gridSize = ( ( size - 1 ) + blockSize - 1 ) / blockSize;
    fillArray<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    cudaFree( devMat );
    TNL_CHECK_CUDA_DEVICE;
  }
#endif
}

template < typename Real,
        typename Device,
        typename Index >
@@ -188,13 +214,13 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRo
#if DEBUG
    printf("On CPU\n");
#endif
    for( int i = 0; i < mainRow.getSize()-1; i++ )
    for( int i = 0; i < this->getNumColumns()-col; i++ )
      mainRow[ i ] = this->getElement( row, col + i ); 
  }
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    for( int i = 0; i < mainRow.getSize()-1; i++ )
    for( int i = 0; i < this->getNumColumns()-col; i++ )
      mainRow.setElement(i, this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ) );
  }
#endif
@@ -293,3 +319,16 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
#endif // HAVE_CUDA
  return *this;
}

#ifdef HAVE_CUDA
// Kernel: each thread copies one element of matrix row `row`, starting at
// column `col`, into the flat device array `data`. Expects a 1D launch with
// at least `size` total threads; surplus threads return immediately.
template < typename Real, typename Index >
__global__ void 
fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col )
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if( tid >= size )
    return;
  data[ tid ] = A->getElement( row, col + tid );
}
#endif
 No newline at end of file
+17 −22
Original line number Diff line number Diff line
@@ -143,7 +143,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
  // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
  Index colPointerMain = 0;
  
  double duration[ this->A.getNumColumns() ];
  // Broadcast buffer: the main-row vector and its raw (classic) data array
  Real* data;
  TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 );
  data = mainRowVec.getData();
  // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
  while( colPointerMain < x.getSize() ){
#ifdef HAVE_MPI
@@ -154,7 +157,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot

    
    // main row vector for computation (pivoting, non-pivoting)
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );
    
    // Setting number of threads and blocks for main kernel and for pivoting swapping kernel
        
@@ -330,45 +332,43 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    }
    else // without pivoting
    {
    std::clock_t start;
    start = std::clock();
#ifdef HAVE_MPI
      //if( processID == 0 )
      //printf( "Initializing mainRow!\n");
      
      Real* data;
      
      if( colPointerMain/this->A.getNumRows() == processID ){
        this->A.getRow( colPointer, colPointerMain, mainRowVec );
        mainRowVec.setElement( mainRowVec.getSize()-1, this->b.getElement( colPointer ) );
        this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 );
        
        data = mainRowVec.getData();
        mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) );
      } 
      else
      {
        cudaMalloc( &data, mainRowVec.getSize() * sizeof(Real) );
      
      if( verbose > 3 ){
        showData<<<1,1>>>(data,mainRowVec.getSize(),processID );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
      }
      
      //printf( "brodcasting mainRow!\n");
      TNL::Communicators::MpiCommunicator::Bcast( data, mainRowVec.getSize(), colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns() + 1, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      
      mainRowVec.bind( data, this->A.getNumColumns() - colPointerMain + 1 );
      //mainRowVec.bind( data, this->A.getNumColumns() + 1 );
      
      /*if( verbose > 2 )
      if( verbose > 2 )
      {
        for( int i = 0; i < numOfProcesses; i++ )
          if( i == processID ){
            std::cout << mainRowVec << std::endl;
          }
        MPI_Barrier(MPI_COMM_WORLD);
      }*/
      }
      
#else
      this->A.getRow(colPointer, colPointerMain, mainRowVec );
      mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) );
#endif
      duration[ colPointerMain ] = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    }
      //printf("computing \n");
    if( verbose > 1 )
    {
#ifdef HAVE_MPI
@@ -437,11 +437,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    // increment colPointerMain for next while passage
    colPointerMain++;
  }
  double time;
  for( int i = 0; i < this->A.getNumColumns(); i++ )
    time += duration[ i ];
  time = time/this->A.getNumColumns();
  printf("%d: copy MPI part: %.8f\n", processID, time );
  // delete all used variables
  cudaFree(pivot);
  cudaFree( devMat );
+1 −1
Original line number Diff line number Diff line
@@ -153,7 +153,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
  if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() 
          && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  {
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ];
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ A->getNumColumns() - colPointerMain ] / mainRow[ 0 ];
  }
}