Commit 82328fca authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

MPI device to device for non-pivoting GEM done!

parent 46164f84
Loading
Loading
Loading
Loading
+10 −1
Original line number Diff line number Diff line
@@ -70,9 +70,18 @@ class Matrix
    */
    void getRow( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Returns the row at the given row index and starting column. Can be
    * called from host only, for mainRow allocated on the CPU.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
    */
    void getRowGPU( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Sets ROW on row and starting column into matrix A. Can be
    * called from host only.
    * called from host only, for mainRow allocated on the GPU.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
+42 −3
Original line number Diff line number Diff line
@@ -128,6 +128,32 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow,
#endif
}

template < typename Real,
           typename Device,
           typename Index >
void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainRow, Index size )
{
  // Copies size-1 elements of row `row`, starting at column `col`, into the
  // device array `mainRow`. Host-callable only; `mainRow` must point to GPU
  // memory (the kernel writes it directly).
  TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have negative row nor negative column!");
  TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix doesn't have that many rows or columns!");
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    // The kernel processes size-1 elements; with size <= 1 the original code
    // would compute blockSize == 0 and issue an invalid launch configuration.
    if( size <= 1 )
      return;

    // Device-side copy of *this so the kernel can call A->getElement().
    Matrix< Real, TNL::Devices::Cuda, Index >* devMat;
    cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
    cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
    TNL_CHECK_CUDA_DEVICE;

    int blockSize = size-1 > 256 ? 256 : size-1;
    // Grid size is the CEILING of (size-1)/blockSize. The previous
    // roundToMultiple( size-1, blockSize ) yields a total THREAD count,
    // which, used as a block count, over-launches by a factor of blockSize.
    int gridSize = ( ( size - 1 ) + blockSize - 1 ) / blockSize;
    fillArray<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    cudaFree( devMat );
    TNL_CHECK_CUDA_DEVICE;
  }
#endif
}

template < typename Real,
        typename Device,
        typename Index >
@@ -188,13 +214,13 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRo
#if DEBUG
    printf("On CPU\n");
#endif
    for( int i = 0; i < mainRow.getSize()-1; i++ )
    for( int i = 0; i < this->getNumColumns()-col; i++ )
      mainRow[ i ] = this->getElement( row, col + i ); 
  }
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    for( int i = 0; i < mainRow.getSize()-1; i++ )
    for( int i = 0; i < this->getNumColumns()-col; i++ )
      mainRow.setElement(i, this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ) );
  }
#endif
@@ -293,3 +319,16 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
#endif // HAVE_CUDA
  return *this;
}

#ifdef HAVE_CUDA
// Kernel: each thread copies one element of matrix row `row`, starting at
// column `col`, into the flat device array `data`. Expects a 1D launch with
// at least `size` total threads; surplus threads return immediately.
template < typename Real, typename Index >
__global__ void 
fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col )
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if( tid >= size )
    return;
  data[ tid ] = A->getElement( row, col + tid );
}
#endif
 No newline at end of file
+17 −22
Original line number Diff line number Diff line
@@ -143,7 +143,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
  // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
  Index colPointerMain = 0;
  
  double duration[ this->A.getNumColumns() ];
  // Broadcast buffer: the main-row vector and its raw (classic) data array
  Real* data;
  TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 );
  data = mainRowVec.getData();
  // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
  while( colPointerMain < x.getSize() ){
#ifdef HAVE_MPI
@@ -154,7 +157,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot

    
    // main row vector for computation (pivoting, non-pivoting)
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );
    
    // Setting number of threads and blocks for main kernel and for pivoting swapping kernel
        
@@ -330,45 +332,43 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    }
    else // without pivoting
    {
    std::clock_t start;
    start = std::clock();
#ifdef HAVE_MPI
      //if( processID == 0 )
      //printf( "Initializing mainRow!\n");
      
      Real* data;
      
      if( colPointerMain/this->A.getNumRows() == processID ){
        this->A.getRow( colPointer, colPointerMain, mainRowVec );
        mainRowVec.setElement( mainRowVec.getSize()-1, this->b.getElement( colPointer ) );
        this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 );
        
        data = mainRowVec.getData();
        mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) );
      } 
      else
      {
        cudaMalloc( &data, mainRowVec.getSize() * sizeof(Real) );
      
      if( verbose > 3 ){
        showData<<<1,1>>>(data,mainRowVec.getSize(),processID );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
      }
      
      //printf( "brodcasting mainRow!\n");
      TNL::Communicators::MpiCommunicator::Bcast( data, mainRowVec.getSize(), colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns() + 1, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      
      mainRowVec.bind( data, this->A.getNumColumns() - colPointerMain + 1 );
      //mainRowVec.bind( data, this->A.getNumColumns() + 1 );
      
      /*if( verbose > 2 )
      if( verbose > 2 )
      {
        for( int i = 0; i < numOfProcesses; i++ )
          if( i == processID ){
            std::cout << mainRowVec << std::endl;
          }
        MPI_Barrier(MPI_COMM_WORLD);
      }*/
      }
      
#else
      this->A.getRow(colPointer, colPointerMain, mainRowVec );
      mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) );
#endif
      duration[ colPointerMain ] = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    }
      //printf("computing \n");
    if( verbose > 1 )
    {
#ifdef HAVE_MPI
@@ -437,11 +437,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    // increment colPointerMain for next while passage
    colPointerMain++;
  }
  double time;
  for( int i = 0; i < this->A.getNumColumns(); i++ )
    time += duration[ i ];
  time = time/this->A.getNumColumns();
  printf("%d: copy MPI part: %.8f\n", processID, time );
  // delete all used variables
  cudaFree(pivot);
  cudaFree( devMat );
+1 −1
Original line number Diff line number Diff line
@@ -153,7 +153,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
  if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() 
          && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  {
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ];
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ A->getNumColumns() - colPointerMain ] / mainRow[ 0 ];
  }
}