Commit 7d9b98a6 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Pivoting with MPI done, unreadable code!

parent 519f8715
Loading
Loading
Loading
Loading
+11 −3
Original line number Diff line number Diff line
@@ -62,15 +62,23 @@ class Matrix
    __cuda_callable__ Real getElement( Index row, Index col ) const;
    
    /**
    * Returns ROW on row and starting column. Cannot be
    * called from host if the matrix is on device and also cannot be called from
    * device if the matrix is on host.
    * Returns ROW on row and starting column. Can be
    * called from host only.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
    */
    void getRow( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Sets ROW on row and starting column into matrix A. Can be
    * called from host only.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
    */
    void setRow( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Returns ROW on row and starting column. Can be
    * called from host if the matrix is on device and also can be called from
+24 −0
Original line number Diff line number Diff line
@@ -128,6 +128,30 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow,
#endif
}

template < typename Real,
        typename Device,
        typename Index >
void Matrix< Real, Device, Index >::setRow( Index row, Index col, Real* mainRow, Index size )
{
  TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
  TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
  if( std::is_same< Device, TNL::Devices::Host >::value )
  {
#if DEBUG
    printf("On CPU\n");
#endif
    for( int i = 0; i < size-1; i++ )
      this->setElement( row, col + i, mainRow[ i ] ); 
  }
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    for( int i = 0; i < size-1; i++ )
      this->data.setElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i, mainRow[ i ] );
  }
#endif
}

template < typename Real,
        typename Device,
        typename Index >
+1 −3
Original line number Diff line number Diff line
@@ -18,8 +18,6 @@
#include <mpi.h>
#endif

#include "gem/GEMkernels.h"
#include "TNL/Cuda/MemoryHelpers.h"
#define COMPARE_RESULTS true;


@@ -99,7 +97,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
    if( verbose > 1 && processID == 0 )
      cout << "starting computation number " << i+1 << endl;
    
    gem.solve( vectorResult, (String)"no", verbose );
    gem.solve( vectorResult, pivoting, verbose );
    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    
    
+279 −66
Original line number Diff line number Diff line
@@ -3,9 +3,12 @@


#ifdef HAVE_CUDA
//#include "GEMkernels.h"
#include "GEMkernels.h"


#ifdef HAVE_MPI
#include "TNL/Communicators/MpiCommunicator.h"
#include "TNL/Communicators/MPITypeResolver.h"
#include <mpi.h>
#endif

@@ -91,92 +94,302 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
#ifdef HAVE_MPI
  MPI_Comm_rank( MPI_COMM_WORLD, &processID );
  MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
  Index colPointerMain = 0;
#endif
  Index colPointerMain = 0;
  
  while( colPointerMain < x.getSize() ){
#ifdef HAVE_MPI
    TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
    Index colPointer = colPointerMain-colPointerMain/this->A.getNumRows() * this->A.getNumRows();
    Real* mainRow;
    int size = this->A.getNumColumns() - colPointerMain + 1;
    mainRow = new Real[size];
    
    if( colPointerMain/this->A.getNumRows() == processID ){
      this->A.getRow( colPointer, colPointerMain, mainRow, size );
      mainRow[ size-1 ] = this->b.getElement( colPointer );
      
      for( int i = 0; i < numOfProcesses; i++ )
        if( i != processID ){
          TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 );
#endif
    Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();
#ifdef HAVE_MPI
    if( processID == 0 )
    {
      showMatrix<<< 1, 1 >>>( this->A );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    }
    } else {
      TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 );
      if( verbose > 2 )
    MPI_Barrier(MPI_COMM_WORLD);
    if( processID == 1 )
    {
        printf( "%d: [", processID);
        for( int i = 0; i < size; i++ )
          printf( "%.2f ", mainRow[ i ] );
        printf("]\n");
      showMatrix<<< 1, 1 >>>( this->A );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    }
      
    MPI_Barrier(MPI_COMM_WORLD);
    if( processID == 2 )
    {
      showMatrix<<< 1, 1 >>>( this->A );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    }
    TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;
    mainRowVec = mainRowVecHost;
    delete []mainRow;
#else
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;
    //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain );
    this->A.getRow(colPointer, colPointerMain, mainRowVec );
    MPI_Barrier(MPI_COMM_WORLD);
#endif
    /*std::cout << processID << ": " << std::endl;
    std::cout << mainRowVec << std::endl;*/
    
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
    // main row vector for computation (pivoting, non-pivoting)
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec;
    
    /*if( pivoting == "yes" )
    
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    
    if( pivoting == "yes" )
    {
      // PIVOTING
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : 
        TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
      int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
      Index fromRow = 0;
      if( processID < (Index)(colPointerMain/this->A.getNumRows()) )
        fromRow = this->A.getNumRows();
      else if( processID == (Index)(colPointerMain/this->A.getNumRows()) )
        fromRow = colPointer;
      
      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1);
      outMax.setValue(0); outPos.setValue(-1);
      
      if( fromRow != this->A.getNumRows() )
      {
        int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : 
          TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );  
        int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
        int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
      //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );
        
      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
        printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n",
                colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow );
        
        outMax.setSize( reduceGridSize );
        outPos.setSize( reduceGridSize );
        //outMax.setValue(0); outPos.setValue(0);
        
      findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
        findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
        
        findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE; 
      *pom = 0;
      cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost);
        //*pom = 0;
        //cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost);
      }
      //std::cout << processID << ": " << outMax << outPos << std::endl;
      
      Real *data, *recvData;
      data = new Real[2];
      recvData = new Real[2*numOfProcesses];
      data[0] = outMax.getElement(0); data[1] = outPos.getElement(0);
      MPI_Barrier( MPI_COMM_WORLD );
      
      MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(),
              recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD);
      
      /*MPI_Barrier( MPI_COMM_WORLD );
      if( processID == 0 )
      {
        printf( "%d:", processID );
        for( int i = 0; i < 2*numOfProcesses; i++ )
          printf( "%.2f ", recvData[i] );
        printf("\n"); 
      }
      MPI_Barrier( MPI_COMM_WORLD );
      if( processID == 1 )
      {
        printf( "%d:", processID );
        for( int i = 0; i < 2*numOfProcesses; i++ )
          printf( "%.2f ", recvData[i] );
        printf("\n"); 
      }*/
      
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
      TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses );
      TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses );
           
      for( int i = 0; i < numOfProcesses; i++ )
      {
        outMaxHost[ i ] = (Real)recvData[ 2*i ];
        outPosHost[ i ] = (Index)recvData[ 2*i+1 ];
      }
      delete []data;
      delete []recvData;
      
      
      /*if( processID == 0 )
      {
        printf( "%d:", processID );
        std::cout << outMaxHost << std::endl;
        std::cout << outPosHost << std::endl;
      }
      MPI_Barrier( MPI_COMM_WORLD );
      if( processID == 1 )
      {
        printf( "%d: ", processID );
        std::cout << outMaxHost << std::endl;
        std::cout << outPosHost << std::endl;
      }*/
      
      Index ProcessMax = -1;
      Index Possition = -1;
      Real Maximum = 0;
      
      for( int i = 0; i < numOfProcesses; i++ )
      {
        if( outPosHost[ i ] != -1 )
          if( Maximum < outMaxHost[ i ] )
          {
            Maximum = outMaxHost[ i ];
            ProcessMax = i;
            Possition = outPosHost[ i ];
          }
      }
      
      /*if( processID == 0 )
      {
        printf( "%d: ", processID );
        std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; 
      }
      MPI_Barrier( MPI_COMM_WORLD );
      if( processID == 1 )
      {
        printf( "%d: ", processID );
        std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; 
      }*/
      Real* mainRow;
      int size = this->A.getNumColumns() - colPointerMain + 1;
      mainRow = new Real[size];
      
    /*if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer )
      if( ProcessMax != colPointerMain/this->A.getNumRows() )
      {
        if( processID == ProcessMax )
        {
          this->A.getRow( Possition, colPointerMain, mainRow, size );
          mainRow[ size-1 ] = this->b.getElement( Possition );
        }
      } else {
        if( colPointerMain/this->A.getNumRows() == processID ){
          if( Possition != colPointer )
          {
            if( verbose > 1 )
            {
              std::cout << std::endl;
        std::cout << "Choosing element at " << *pom << "-th row as pivot with value..."  << std::endl;
        std::cout << "Swapping " << colPointer << "-th and " << *pom <<  "-th rows ... " << std::endl;
              std::cout << "Choosing element at " << Possition << "-th row as pivot with value..."  << std::endl;
              std::cout << "Swapping " << colPointer << "-th and " << Possition <<  "-th rows ... " << std::endl;
            }
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
            swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
          }
          this->A.getRow( colPointer, colPointerMain, mainRow, size );
          mainRow[ size-1 ] = this->b.getElement( colPointer );
        } 
      }
      TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      
      TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
      mainRowVec = mainRowVecHost;
      delete []mainRow;
      
      if( ProcessMax != colPointerMain/this->A.getNumRows() )
      {
        Real *mainRowSwap;
        mainRowSwap = new Real[size];
        
        if( processID == ProcessMax )
        {
          TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
          printf( "%d: ", processID );
          for( int i = 0; i < size; i++ )
            printf( "%.2f ", mainRowSwap[ i ] );
          printf( "\n" );
          this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
          this->b.setElement( Possition, mainRowSwap[ size-1 ] );
        }
        else if( processID == colPointerMain/this->A.getNumRows() )
        {
          this->A.getRow( colPointer, colPointerMain, mainRowSwap, size );
          mainRowSwap[ size-1 ] = this->b.getElement( colPointer );
          
          TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 );
          printf( "%d: ", processID );
          for( int i = 0; i < size; i++ )
            printf( "%.2f ", mainRow[ i ] );
          printf( "\n" );
          this->A.setRow( colPointer, colPointerMain, mainRow, size );
          this->b.setElement( colPointer, mainRow[ size-1 ] );
        }        
        delete []mainRowSwap;
      }
      #ifdef HAVE_MPI
       if( processID == 0 )
       {
       showMatrix<<< 1, 1 >>>( this->A );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
       }
       MPI_Barrier(MPI_COMM_WORLD);
       if( processID == 1 )
       {
       showMatrix<<< 1, 1 >>>( this->A );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
       }
       MPI_Barrier(MPI_COMM_WORLD);
       if( processID == 2 )
       {
       showMatrix<<< 1, 1 >>>( this->A );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
       }
       MPI_Barrier(MPI_COMM_WORLD);
       if( processID == 0 )
       std::cout << this->b << std::endl;
       MPI_Barrier(MPI_COMM_WORLD);
       if( processID == 1 )
       std::cout << this->b << std::endl;
       MPI_Barrier(MPI_COMM_WORLD);
       if( processID == 2 )
       std::cout << this->b << std::endl;
       #endif
    }
    else 
    {
#ifdef HAVE_MPI
      Real* mainRow;
      int size = this->A.getNumColumns() - colPointerMain + 1;
      mainRow = new Real[size];
      
      if( colPointerMain/this->A.getNumRows() == processID ){
        this->A.getRow( colPointer, colPointerMain, mainRow, size );
        mainRow[ size-1 ] = this->b.getElement( colPointer );
        
        /*for( int i = 0; i < numOfProcesses; i++ )
         if( i != processID ){
         TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 );
         }*/
      } //else {
      //TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 );
      TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      if( verbose > 2 )
      {
        printf( "%d: [", processID);
        for( int i = 0; i < size; i++ )
          printf( "%.2f ", mainRow[ i ] );
        printf("]\n");
      }
      
      //}
      TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
      mainRowVec = mainRowVecHost;
      delete []mainRow;
#else
      //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain );
      this->A.getRow(colPointer, colPointerMain, mainRowVec );
#endif
      
    }
    /*std::cout << processID << ": " << std::endl;
     std::cout << mainRowVec << std::endl;*/
    
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
    
    
    GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, 
            device_vector.getView(),
+5 −5
Original line number Diff line number Diff line
@@ -60,10 +60,10 @@ __inline__ __device__ void blockReduceArgMax(Real& val, int& index)
template <typename Real >
__global__ 
void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, 
        int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
        int fromRow, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
        TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition)
{
  int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain;
  int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + fromRow;
  Real firstElementInRow = rowPointer >= A->getNumRows() ? 0 : TNL::abs( A->getElement(rowPointer, colPointerMain) );
  int index = rowPointer;
  blockReduceArgMax( firstElementInRow, index );
@@ -95,12 +95,12 @@ template <typename Real >
__global__ 
void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, 
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
        int colPointerMain, int numBlocksOnRow, int* positionPivot )
        int colPointerMain, int numBlocksOnRow, int positionPivot )
{
  if( *positionPivot > colPointerMain )
  if( positionPivot > colPointerMain )
  {
    int rowPointer1 = colPointerMain;
    int rowPointer2 = *positionPivot;
    int rowPointer2 = positionPivot;
    int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
    if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
    {