Commit dc689b13 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

MPI without pivoting with unknown error depending on !!!loops!!!

parent a03b09c1
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -41,14 +41,14 @@ clean_curdir:
dist: clean
	tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES)

$(TARGETS): % : %.o
	$(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS)
#$(TARGETS): % : %.o
#	$(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS)

$(CUDA_TARGETS): % : %.cu.o
	$(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS)
	$(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS)

$(SOURCES:%.cpp=%.o): %.o: %.cpp
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
#$(SOURCES:%.cpp=%.o): %.o: %.cpp
#	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<

$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu
	$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $<
	$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $<
+3 −2
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ CUDA_ARCH = auto
# Set-up compilers
CXX = g++
CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc
#CUDA_CXX =/home/maty/.openmpi/bin/mpicc

# Set-up CXX_FLAGS
CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS)
@@ -27,10 +28,10 @@ else
endif

# Set-up CUDA_CXXFLAGS
CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA --std c++14 -I$(TNL_HEADERS)
CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS)

ifeq ( $(WITH_CUDA), yes )
   CUDA_CXXFLAGS += -DHAVE_CUDA
   CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI
   ifeq ( $(CUDA_ARCH), auto )
      CUDA_CXXFLAGS += `tnl-cuda-arch`
   else
+23 −3
Original line number Diff line number Diff line
@@ -51,7 +51,7 @@ class Matrix
    void setElement( Index row, Index col, Real value );
    
    /**
    * Returns element on row and column but on device that it's saved! Cannot be
    * Returns element on row and column. Cannot be
    * called from host if the matrix is on device and also cannot be called from
    * device if the matrix is on host.
    *
@@ -60,11 +60,31 @@ class Matrix
    */
    __cuda_callable__ Real getElement( Index row, Index col ) const;
    
    /**
    * Returns ROW on row and starting column. Cannot be
    * called from host if the matrix is on device and also cannot be called from
    * device if the matrix is on host.
    *
    * @param row and column, mainRow is array with size to be filled with values.
    * @return void.
    */
    void getRow( Index row, Index col, Real* mainRow, Index size );
    
    /**
    * Returns ROW on row and starting column. Can be
    * called from host if the matrix is on device and also can be called from
    * device if the matrix is on host. Values are saved in vector mainRow
    *
    * @param row and column, mainRow is output vector.
    * @return void
    */
    void getRow( Index row, Index col, Vector& mainRow );
    
    /**
    * Saves data array into array parameter even in cross device function matrix.
    * Data array is huge on device for coalesced memory access condition! 
    *
    * @param VectorView eather on host or device, depends where we want to store the data.
    * @param Vector eather on host or device, depends where we want to store the data.
    * @return void
    */
    template < typename Device1 >
+57 −8
Original line number Diff line number Diff line

//#include "Matrix.h"

#define DEBUG 0

template < typename Real,
@@ -92,12 +95,65 @@ Real Matrix< Real, Device, Index >::getElement( Index row, Index col ) const
  return 1;
}

template < typename Real,
        typename Device,
        typename Index >
void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, Index size )
{
  TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
  TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
  if( std::is_same< Device, TNL::Devices::Host >::value )
  {
#if DEBUG
    printf("On CPU\n");
#endif
    for( int i = 0; i < size-1; i++ )
      mainRow[ i ] = this->getElement( row, col + i ); 
  }
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    for( int i = 0; i < size-1; i++ )
      mainRow[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i );
  }
#endif
}

template < typename Real,
        typename Device,
        typename Index >
void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRow )
{
  TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!");
  TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!");
  if( std::is_same< Device, TNL::Devices::Host >::value )
  {
#if DEBUG
    printf("On CPU\n");
#endif
    for( int i = 0; i < mainRow.getSize()-1; i++ )
      mainRow[ i ] = this->getElement( row, col + i ); 
  }
#ifdef HAVE_CUDA
  if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    TNL::Containers::Vector< Real, TNL::Devices::Host, Index > tempVec( mainRow.getSize() );
    for( int i = 0; i < mainRow.getSize()-1; i++ )
      tempVec[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i );
    mainRow = tempVec;
  }
#endif
}

template < typename Real,
        typename Device,
        typename Index >
template < typename Device1 >
void Matrix< Real, Device, Index >::getData( TNL::Containers::Vector< Real, Device1, Index >& dataOut )
{
  /*dataOut.setSize( this->data.getSize() );
  for( int i = 0; i < this->data.getSize(); i++ )
    dataOut.setElement( i , this->data.getElement( i ));*/
  dataOut = this->data;
}

@@ -161,31 +217,24 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
  matrix.getData( pom );
  if( std::is_same< Device, Device2 >::value )
  {
    //printf("copy host to host or device to device\n");
    printf("copy host to host or device to device\n");
    this->data = pom; 
  }
#ifdef HAVE_CUDA
  else if( std::is_same< Device, TNL::Devices::Host >::value )
  {
#if DEBUG
    printf("copy device to host\n");
#endif
    for( int i = 0; i < this->rows; i++ )
      for( int j = 0; j < this->columns; j++ )
        this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] );
  } 
  else if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
#if DEBUG
    printf("copy host to device\n");
    printf("data size alocated: %d\n", this->data.getSize() );
#endif
    for( int i = 0; i < this->getNumRows(); i++ )
      for( int j = 0; j < this->getNumColumns(); j++ )
      {
#if DEBUG
        printf("copying element i,j = %d, %d with value %.4f\n", i, j, pom.getElement( i * this->columns + j ) );
#endif
        this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j,
                pom.getElement( i * this->columns + j ) );
      }
+199 −53
Original line number Diff line number Diff line
@@ -11,9 +11,15 @@
#include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h"
#include "TNL/tnl-dev/src/TNL/Devices/Host.h"

#ifdef HAVE_MPI
#include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h"
#include <mpi.h>
#endif

#include "gem/GEMkernels.h"
#include "TNL/Cuda/MemoryHelpers.h"
#define COMPARE_RESULTS true;

// Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting

using namespace TNL;
using namespace TNL::Containers;
@@ -24,6 +30,12 @@ template < typename Real, typename Index >
void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix,
        Vector< Real, Devices::Host, Index >& vector, const String& vectorName );

#ifdef HAVE_MPI
template< typename Real, typename Index >
void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
        Vector< Real, Devices::Host, Index >& vector );
#endif

template < typename Real, typename Index >
void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName );

@@ -31,7 +43,8 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName, const int verbose );
        const String& matrixName,  const String& vectorName,
        Index &rows, Index &nonzeros, const int verbose );


template < typename Real,
@@ -42,12 +55,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
{  
  typedef Matrix< Real, Device, Index > MatrixType;
  typedef Vector< Real, Device, Index > VectorType;
  int processID = 0; // MPI processID, without mpi == 0
#ifdef HAVE_MPI
  MPI_Comm_rank( MPI_COMM_WORLD, &processID );
  Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif
  
  MatrixType matrix;
  VectorType vector;
  
  readMatrixVector( matrix, vector, matrixName, vectorName, verbose );
  VectorType vectorResult( matrix.getNumRows() );
  Index rows, nonzeros;
  readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose );
  VectorType vectorResult( rows );
  vectorResult.setValue(0);
  MatrixType matrixComp;
  VectorType vectorComp( vector );
  
  // Computation
  double* time;
@@ -58,22 +79,67 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
    cout << "Starting computation on " << device << endl;
  for( int i = 0; i < loops; i++ )
  {
    MatrixType matrixComp = matrix;
    VectorType vectorComp( vector );
    vectorResult.setValue( 0 );
    
    printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops );
#ifdef HAVE_MPI
    MPI_Barrier( MPI_COMM_WORLD );
#endif 
    printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops );
    printf("process %d is in loop %d from all loops %d\n", processID, i, loops );
#ifdef HAVE_CUDA
    printf( "%d process:\n", processID );
    showMatrix<<< 1, 1 >>>( matrix );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    std::cout << vector << endl;
    cout << vectorResult << endl;
#endif
    
    matrixComp = matrix;
    
    printf("%d: matrix coppied\n", processID );
#ifdef HAVE_MPI
    MPI_Barrier( MPI_COMM_WORLD );
#endif 
#ifdef HAVE_CUDA
    printf( "%d process:\n", processID );
    showMatrix<<< 1, 1 >>>( matrixComp );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
#endif
    cout << processID << ":" << vector << endl;
    cout << processID << ":" << vectorComp << endl;
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    printf("%d:copiing Vector \n", processID);
    //vectorComp = vector;
    
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    //vectorResult.setValue( 0 );
    
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    GEM< Real, Device, Index > gem( matrixComp, vectorComp );
    
    std::clock_t start;
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    double duration;

    std::clock_t start;
    start = std::clock();
    
    if( verbose > 1 )
    if( verbose > 1 && processID == 0 )
      cout << "starting computation number " << i+1 << endl;
    gem.solve( vectorResult, pivoting, verbose );
    
    gem.solve( vectorResult, (String)"no", verbose );
    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    if( processID == 0 )
    {
      time[i] = duration;
      
      if( vectorName == "none" )
@@ -88,7 +154,11 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
          printf( "Norm in %d calculation is %.4f\n", i+1, l2norm);
      }
    }
  if( verbose > 1 )
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
  }
  if( verbose > 1 && processID == 0 )
    printf("\n ... done!\n");
  
  double timeMean = 0;
@@ -96,19 +166,34 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
    timeMean += time[i];
  timeMean /= loops;
  
  if( processID == 0 )
    printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(),
          matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(),
            rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(),
            device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error);
  
  delete []time;
  
#ifdef HAVE_MPI
  Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
  
  if( processID == 0 ){
    printf("%d: returning\n", processID );
    return vectorResult;
    
  }
  else{
    printf("%d: returning\n", processID );
    vectorResult.setValue( 0 );
    return vectorResult;
  }
}

template < typename Real, typename Index, typename Device >
void readMatrixVector( Matrix< Real, Device, Index>& matrix, 
        Vector< Real, Device, Index >& vector,
        const String& matrixName,  const String& vectorName, const int verbose )
        const String& matrixName,  const String& vectorName, 
        Index &rows, Index &nonzeros, const int verbose )
{
  typedef Matrix< Real, Devices::Host, Index> MatrixHost;
  MatrixHost matrixHost;
@@ -117,9 +202,9 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
  // Get matrix
  Matrices::MatrixReader< MatrixHost > m;
  m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose );
  if( verbose > 2 )
    matrixHost.showMatrix();
  matrix = matrixHost;
  rows = matrixHost.getNumRows();
  nonzeros = matrixHost.getNumNonzeros();
  
  
  // Get vector
  Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() );
@@ -132,9 +217,19 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
    if( verbose > 1 )
      cout << "reading vector " << vectorName << endl;
  }
  
#ifdef HAVE_MPI
  cutMatrixVectorMPI( matrixHost, vectorHost );
#endif
  if( verbose > 2 )
    matrixHost.showMatrix();
  
  if( verbose > 2 )
    cout << vectorHost << endl;
  
  // Copy from CPU into matrix dependent on template Device
  vector = vectorHost;
  matrix = matrixHost;
  
}

@@ -186,3 +281,54 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String
  }
  inFile.close();
}


#ifdef HAVE_MPI
template< typename Real, typename Index >
void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
        Vector< Real, Devices::Host, Index >& vector )
{
  Matrix< Real, Devices::Host, Index> matrixTemp;
  Vector< Real, Devices::Host, Index > vectorTemp;
  
  int processID;
  int numOfProcesses;
  
  MPI_Comm_rank( MPI_COMM_WORLD, &processID );
  MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
  
  if( processID == 0 )
  {
    printf( "%d: %d\n", numOfProcesses, processID );
    matrix.showMatrix();
    cout << vector << endl;
  }
  
  
  Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses );
  matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() );
  vectorTemp.setSize( numRowsCUT );
  
  //printf( "%d: %d num of rows = %d\n", numOfProcesses, processID, numRowsCUT );
  
  for( int j = 0; j < matrix.getNumColumns(); j++ ){
    for( int i = 0; i < numRowsCUT; i++ ){
      if( i + numRowsCUT * processID < matrix.getNumRows() ){
        matrixTemp.setElement( i, j, matrix.getElement( i + numRowsCUT * processID, j ) );
      }else{
        matrixTemp.setElement(i,j,0);
      }
    }
  }
  for( int i = 0; i < numRowsCUT; i++ ){
    if( i + numRowsCUT * processID < matrix.getNumRows() ){
      vectorTemp.setElement( i, vector.getElement( i + numRowsCUT * processID ) );
    }else{
      vectorTemp.setElement( i, 0 );
    }
  }
  
  matrix = matrixTemp;
  vector = vectorTemp;
}
#endif
 No newline at end of file
Loading