Commit 111cd841 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Went back in commits for old GPU code. Now MPI turns on MPI code and without...

Went back in commits for old GPU code. Now MPI turns on MPI code and without MPI turns on old GPU code.
parent 22699056
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@ PROJECT_NAME = tnl-gem
TNL_HEADERS = ${HOME}/.local/include
INSTALL_DIR = ${HOME}/.local
WITH_CUDA = yes
WITH_MPI = yes
WITH_MPI = no
WITH_OPENMP = yes
WITH_DEBUG = yes
MPI_FLAGT =
+1 −3
Original line number Diff line number Diff line
@@ -74,9 +74,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
  
  if( verbose > 1 )
    cout << "Starting computation on " << device << endl;
  /*char hostname[256];
  gethostname(hostname, sizeof(hostname));
  printf("PID %d on %s ready for attach\n", getpid(), hostname);*/
  
  for( int i = 0; i < loops; i++ )
  {
@@ -98,6 +95,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
      cout << "starting computation number " << i+1 << endl;
    
    gem.solve( vectorResult, pivoting, verbose );
    
    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
    
    
+4 −0
Original line number Diff line number Diff line
@@ -34,6 +34,10 @@ class GEM
    
    bool computeLUDecomposition( int verbose = 0 );
    
#ifdef HAVE_MPI
    bool GEMdeviceMPI( Array& x, const TNL::String& pivoting, int verbose );
#endif
    
    bool GEMdevice( Array& x, const TNL::String& pivoting, int verbose );
    
    bool setMatrixVector( MatrixGEM& A, Array& b ){
+11 −1
Original line number Diff line number Diff line
@@ -13,7 +13,11 @@
#pragma once

#include <TNL/Assert.h>
#ifdef HAVE_MPI
#include "GEMdeviceMPI.h"
#else
#include "GEMdevice.h"
#endif
#include "GEM.h"
#include <fstream>
  
@@ -295,9 +299,15 @@ class GEMDeviceDependentCode< TNL::Devices::Cuda >
                   TNL::Containers::Vector< Real, DeviceType, Index >& x, const TNL::String& pivoting, int verbose )
    {
#ifdef HAVE_CUDA
#ifdef HAVE_MPI
      if( verbose > 1 )
        printf("Starting the computation SolveGEM.\n");
        printf("Starting the computation SolveGEM MPI.\n");
      gem.GEMdeviceMPI( x, pivoting, verbose );
#else
      if( verbose > 1 )
        printf("Starting the computation SolveGEM MPI.\n");
      gem.GEMdevice( x, pivoting, verbose );
#endif
#endif
      return true;
    }
+55 −376
Original line number Diff line number Diff line
#include "TNL/Cuda/MemoryHelpers.h"
#define DEBUG 0
#include <fstream> // saving and loading vector.txt
#include <string> // input from cmd

template < typename Real, typename Index >
void saveVec( Real* mainRow, Index size, int processID, Index colPointerMain )
{
  std::ofstream outdata; // outdata is like cin
  std::string s( "./test-matrices/mainRow" );
  s = s + std::to_string(processID) + "_" + std::to_string(colPointerMain);
  outdata.open(s); // opens the file
  if( !outdata ) { // file couldn't be opened
    std::cerr << "Error: file could not be opened" << std::endl;
    exit(1);
  }

  for( int i = 0; i < size; i++ )
  {
    outdata << mainRow[ i ] << std::endl;
  }
  outdata.close();
}

#ifdef HAVE_CUDA
#include "GEMkernels.h"


#ifdef HAVE_MPI
#include "TNL/Communicators/MpiCommunicator.h"
#include "TNL/Communicators/MPITypeResolver.h"
#include <mpi.h>
#endif

template < typename Real,
        typename Index >
void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector, 
        TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x, 
        int processID, int numOfProcesses )
                TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& x )
{ 
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat;// = TNL::Cuda::passToDevice( this->A );
  cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
  cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
  TNL_CHECK_CUDA_DEVICE;
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);
  
  int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows();
  int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
  int numOfBlocks =  matrix.getNumRows() * numBlocksOnColumn;
    
  
  GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), processID );
  GEMDiagToResult<<< numOfBlocks, blockSize >>>( devMat,device_vector.getView(), x.getView() );
  cudaDeviceSynchronize();
  TNL_CHECK_CUDA_DEVICE;
#ifdef HAVE_MPI
  if( processID != 0 )
  {
    TNL::Containers::Vector< Real, TNL::Devices::Host, Index > host_vector;
    host_vector = device_vector;
    Real* data = host_vector.getData();
    
    if(processID*device_vector.getSize() < x.getSize() ){
      TNL::Communicators::MpiCommunicator::ISend( data, device_vector.getSize(), 0, 0 );
    }
    
  }else{
    for( int j = 0; j < device_vector.getSize(); j++ )
      x.setElement( j, device_vector.getElement( j ) );
    for( int i = 1; i < numOfProcesses && i*device_vector.getSize() < x.getSize(); i++ )
    {
      Real* data;
      data = new Real[ device_vector.getSize() ];
      TNL::Communicators::MpiCommunicator::Recv( data, device_vector.getSize(), i, 0 );
      
      for( int j = 0; j < device_vector.getSize() && i * device_vector.getSize() + j < x.getSize(); j++ )
        x.setElement( i * device_vector.getSize() + j, data[ j ] );
      delete []data;
    }
  }
  //std::cout << processID << ": " << x << std::endl;
#else
  x = device_vector;
#endif
  cudaFree( ( void* ) devMat );
  TNL_CHECK_CUDA_DEVICE;
} 


@@ -92,345 +32,84 @@ template < typename Real,
        typename Index >
bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose )
{
  // Copy matrix A and vector b to GPU 
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat;
  cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) );
  cudaMemcpy( ( void* ) devMat,( void* ) &this->A, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
  TNL_CHECK_CUDA_DEVICE;
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( this->A );
  TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b );
  
  // FOR PIVOTING SET VARIABLES ON DEVICE
  int* pivot; cudaMalloc(&pivot, sizeof(int));
    
  // Initialise MPI variables even without MPI
  int processID=0;
  int numOfProcesses=1;
  
#ifdef HAVE_MPI
  MPI_Comm_rank( MPI_COMM_WORLD, &processID );
  MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
#endif
  size_t size = sizeof(int);
  int* pivot; cudaMalloc(&pivot, size);
  int* pom = (int*)malloc(size); *pom = -1;
  
  if( verbose > 2 )
  {
#ifdef HAVE_MPI
    for( int i = 0; i < numOfProcesses; i++ )
    {
      if( processID == i )
      {
        showMatrix<<< 1, 1 >>>( this->A );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
      }
      MPI_Barrier(MPI_COMM_WORLD);
    }
    for( int i = 0; i < numOfProcesses; i++ )
    {
      if( i == processID )
        std::cout << device_vector;
      MPI_Barrier(MPI_COMM_WORLD);
    }
    std::cout << std::endl;
#else
    showMatrix<<< 1, 1 >>>( this->A );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    printf("\n");
#endif
  }
  
  // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows)
  Index colPointerMain = 0;
  
  // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices.
  while( colPointerMain < x.getSize() ){
#ifdef HAVE_MPI
    TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif
    // Pointer to rows in main process that has colPointerMain.
    Index colPointer = colPointerMain-TNL::floor(colPointerMain/this->A.getNumRows()) * this->A.getNumRows();

    
    // main row vector for computation (pivoting, non-pivoting)
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );
    
    // Setting number of threads and blocks for main kernel and for pivoting swapping kernel
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
  
  for( int colPointer = 0; colPointer < this->A.getNumColumns(); colPointer++ )
  {
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
    
    if( pivoting == "yes" )
    {
      // PIVOTING
      // fromRow saves info for each process saying from which row to start looking for pivot
      Index fromRow = 0;
      if( processID < TNL::floor(colPointerMain/this->A.getNumRows()) )
        fromRow = this->A.getNumRows();
      else if( processID == TNL::floor(colPointerMain/this->A.getNumRows()) )
        fromRow = colPointer;
      
      // Inicialising vectors for maximum and position (vectors important for multiple blocks, otherwise its vector with length 1)
      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1);
      outMax.setValue(0); outPos.setValue(-1);
      
      
      // those blocks that have rows to look for pivot in should start looking
      if( fromRow != this->A.getNumRows() )
      {
        // setting size for kernel of pivoting
        int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : 
          TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );  
        int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : 
        TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
      int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
      int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
      //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );

        // resizing outMax and outPos for the kernel of pivoting
        outMax.setSize( reduceGridSize );
        outPos.setSize( reduceGridSize );
      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
      //outMax.setValue(0); outPos.setValue(0);

        // two main pivoting kernels executes and saves max and position into 0-th element of vectors outMax and outPos
        findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() );
      findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;

      findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
      *pom = 0;
      cudaMemcpy( pom, pivot, size, cudaMemcpyDeviceToHost);
    }
      // Now each process has info about maximum in the "active" part of matrix that they calculate with
      
      // Preparing dynamic arrays for MPI send and recv resp. AllGather.
      // data stores information to send from each process
      // recvData stores information that is received
#ifdef HAVE_MPI
      Real *data, *recvData;
      data = new Real[2];
      recvData = new Real[2*numOfProcesses];
      data[0] = outMax.getElement(0); data[1] = outPos.getElement(0);
      MPI_Barrier( MPI_COMM_WORLD );
    
      MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(),
              recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD);
#endif      
    
      // Initialising maximum and possition and process that has the overall maximum ( ?: for non-MPI program)
      Index ProcessMax = numOfProcesses != 1 ? -1 : 0;
      Index Possition = numOfProcesses != 1 ? 0 : outPos.getElement(0);
      Real Maximum = numOfProcesses != 1 ? 0 : outMax.getElement(0);
      
#ifdef HAVE_MPI
      // clasic maximum finding + storing processMax and possition
      for( int i = 0; i < 2*numOfProcesses; i = i+2 )
      {
        if( recvData[ i+1 ] != -1 )
          if( Maximum < recvData[ i ] )
          {
            Maximum = recvData[ i ];
            ProcessMax = TNL::floor( i/2 );
            Possition = recvData[ i+1 ];
          }
      }
      if( verbose > 1 && processID == 0 )
        printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax );
      // All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays.
      delete []data;
      delete []recvData;
#endif
      // Clasic Maximum == 0 then we occured zero pivot so ending this calculation
      if( Maximum == 0 )
      {
        std::cout << "Ooops zero pivot occured in " << colPointerMain << "-th step." << std::endl;
        return false;
      }
      
      // Now when every process has the ProcessMax of pivoting row across all processes
      // we can send pivoting row to all processes from ProcessMax
      // mainRow stores pivoting row
      Real* mainRow;
      int size = this->A.getNumColumns() - colPointerMain + 1;
      mainRow = new Real[size];
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    
    
      // If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself.
      // else means that ProcessMax is in the main process that contains colPointerMain, in this case we need to do normal pivoting
      // so it swaps rows and fills mainRow normally.
      if( ProcessMax != colPointerMain/this->A.getNumRows() )
      {
        if( processID == ProcessMax )
        {
          this->A.getRow( Possition, colPointerMain, mainRow, size );
          mainRow[ size-1 ] = this->b.getElement( Possition );
        }
      } else {
        if( colPointerMain/this->A.getNumRows() == processID ){
          if( Possition != colPointer )
    if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer )
    {
      if( verbose > 1 )
      {
         std::cout << std::endl;
              std::cout << "Choosing element at " << Possition << "-th row as pivot with value..."  << std::endl;
              std::cout << "Swapping " << colPointer << "-th and " << Possition <<  "-th rows ... " << std::endl;
         std::cout << "Choosing element at " << *pom << "-th row as pivot with value..."  << std::endl;
         std::cout << "Swapping " << colPointer << "-th and " << *pom <<  "-th rows ... " << std::endl;
      }
            swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    } 
    
          this->A.getRow( colPointer, colPointerMain, mainRow, size );
          mainRow[ size-1 ] = this->b.getElement( colPointer );
        } 
      }
      
      // Broad casting the pivoting row to all processes
#ifdef HAVE_MPI
      MPI_Barrier(MPI_COMM_WORLD);
      TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD);
      //if( colPointerMain%100 == 0 )
      //  saveVec( mainRow, size, processID, colPointerMain );
      if( verbose > 1 )
      {
        printf( "%d: [", processID);
        for( int i = 0; i < size; i++ )
          printf( "%.2f ", mainRow[ i ] );
        printf("]\n");
      }
      
      
      // Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process.
      // mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows()
      if( ProcessMax != colPointerMain/this->A.getNumRows() )
      {
        Real *mainRowSwap;
        mainRowSwap = new Real[size];
        
        
        if( processID == ProcessMax )
        {
          TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 );
          this->A.setRow( Possition, colPointerMain, mainRowSwap, size );
          this->b.setElement( Possition, mainRowSwap[ size-1 ] );
        }
        else if( processID == colPointerMain/this->A.getNumRows() )
        {
          this->A.getRow( colPointer, colPointerMain, mainRowSwap, size );
          mainRowSwap[ size-1 ] = this->b.getElement( colPointer );
          
          TNL::Communicators::MpiCommunicator::Send( mainRowSwap, size, ProcessMax, 0 );
          this->A.setRow( colPointer, colPointerMain, mainRow, size );
          this->b.setElement( colPointer, mainRow[ size-1 ] );
        }    
        delete []mainRowSwap;
      }
#endif
      // Main kernel works with vector as a main row, so all processes has to set mainRowVec.
      TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
      mainRowVec = mainRowVecHost;
      delete []mainRow; 
    }
    else // without pivoting
    {
#ifdef HAVE_MPI
      Real* mainRow;
      int size = this->A.getNumColumns() - colPointerMain + 1;
      mainRow = new Real[size];
      
      if( colPointerMain/this->A.getNumRows() == processID ){
        this->A.getRow( colPointer, colPointerMain, mainRow, size );
        mainRow[ size-1 ] = this->b.getElement( colPointer );
      } 
      
      TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD);
      
      if( verbose > 2 )
      {
        printf( "%d: [", processID);
        for( int i = 0; i < size; i++ )
          printf( "%.2f ", mainRow[ i ] );
        printf("]\n");
      }
      
      TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size );
      mainRowVec = mainRowVecHost;
      delete []mainRow;
#else
      this->A.getRow(colPointer, colPointerMain, mainRowVec );
      mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) );
#endif
    }
    if( verbose > 1 )
    {
#ifdef HAVE_MPI
      for( int i = 0; i < numOfProcesses; i++ )
      {
        if( processID == i )
        {
          std::cout << mainRowVec << std::endl;
        }
        MPI_Barrier(MPI_COMM_WORLD);
      }
#else
      std::cout << mainRowVec << std::endl;
#endif
    }
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() );
    //std::cout << mainRowVec << std::endl;
    // Finally main kernel calculates the GEM for colPointerMain from mainRowVec
    GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, 
                                                  device_vector.getView(), 
            mainRowVec.getView(),
                                                  colPointer, 
            colPointerMain,
            numBlocksOnRow,
            processID,
            numOfProcesses );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    
    
    if( verbose > 2 )
    {
      #ifdef HAVE_MPI
    for( int i = 0; i < numOfProcesses; i++ )
    {
      if( processID == i )
      {
        showMatrix<<< 1, 1 >>>( this->A );
        cudaDeviceSynchronize();
        TNL_CHECK_CUDA_DEVICE;
      }
      MPI_Barrier(MPI_COMM_WORLD);
    }
    for( int i = 0; i < numOfProcesses; i++ )
    {
      if( i == processID )
        std::cout << device_vector;
      MPI_Barrier(MPI_COMM_WORLD);
    }
    std::cout << std::endl;
#else
    showMatrix<<< 1, 1 >>>( this->A );
                                                  numBlocksOnRow );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    std::cout << this->b << std::endl;
    printf("\n");
#endif
    }
    // increment colPointerMain for next while passage
    colPointerMain++;
  }
  
  // delete all used variables
  cudaFree(pivot);
  cudaFree( devMat );
  TNL_CHECK_CUDA_DEVICE;
  free(pom);
  
  calculResultVector( this->A, device_vector, x );
  
  // Calculate real result 
  // (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros)
  calculResultVector( this->A, device_vector, x, processID, numOfProcesses );
  //if( processID == 0 )
  //  std::cout << x << std::endl;
  return true;
}

Loading