Commit 519f8715 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Fixed an error; the CUDA MPI version without pivoting now works

parent dc689b13
Loading
Loading
Loading
Loading
+7 −7
Original line number Diff line number Diff line
include Makefile.inc

SUBDIRS = 
HEADERS = "./TNL/src/TNL"
HEADERS = "./TNL/tnl-dev/src/TNL"
SOURCES = tml-gem.cpp
CUDA_SOURCES = tnl-gem-cuda.cu
TARGETS = tnl-gem 
@@ -41,14 +41,14 @@ clean_curdir:
dist: clean
	tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES)

#$(TARGETS): % : %.o
#	$(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS)
# Link every host target from the object file of the same name
# (static pattern rule: target 'x' is built from 'x.o').
$(TARGETS): % : %.o
	$(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS)

$(CUDA_TARGETS): % : %.cu.o
	$(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS)
	$(CUDA_CXX) $(CUDA_LDFLAGS) $(MPI_FLAGS) $(MPI_FLAGT) -o $@ $< $(CUDA_LDLIBS)

#$(SOURCES:%.cpp=%.o): %.o: %.cpp
#	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
# Compile every file listed in SOURCES into its object file
# (the substitution reference maps 'x.cpp' to 'x.o').
$(SOURCES:%.cpp=%.o): %.o: %.cpp
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<

$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu
	$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $<
	$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(MPI_FLAGS) -c -o $@ $<
+10 −0
Original line number Diff line number Diff line
@@ -5,8 +5,11 @@ PROJECT_NAME = tnl-gem
TNL_HEADERS = ${HOME}/.local/include
INSTALL_DIR = ${HOME}/.local
WITH_CUDA = yes
WITH_MPI = yes
WITH_OPENMP = yes
WITH_DEBUG = yes
MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
MPI_FLAGS =-I/home/maty/.openmpi/include

# If TNL is installed on your system, CUDA arch can be detected automatically using
# a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. 
@@ -39,6 +42,13 @@ ifeq ( $(WITH_CUDA), yes )
   endif
endif


# Set up the MPI flags when WITH_MPI is 'yes':
#   MPI_FLAGT - linker flags (library search path and -lmpi)
#   MPI_FLAGS - compiler flags (include search path)
# The same two variables are also assigned unconditionally near the top of
# this file, with identical values.
# NOTE(review): the paths point into one user's home directory - TODO confirm
# whether they should come from the environment or an MPI compiler wrapper.
# NOTE(review): the blanks inside the parentheses of the ifeq below may be
# kept as part of the compared strings by GNU make - verify that this branch
# is actually taken.
ifeq ( $(WITH_MPI), yes )
	MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi
	MPI_FLAGS =-I/home/maty/.openmpi/include
endif

# Set-up CPPFLAGS
CPPFLAGS = -MD -MP

+1 −0
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@ class Matrix
    
    Matrix();
    Matrix( const Index rows, const Index columns );
    Matrix( Matrix< Real, Device, Index>& matrix );
    
    /**
    * Sets dimension for matrix of rows x columns elements and allocates memory
+13 −8
Original line number Diff line number Diff line

//#include "Matrix.h"

#define DEBUG 0

template < typename Real,
@@ -17,6 +15,17 @@ Matrix< Real, Device, Index >::Matrix( Index rows, Index columns )
{ 
  this->setDimensions( rows, columns );  
}

/**
 * Constructs a matrix from another matrix with the same Real, Device and
 * Index types.
 *
 * The row and column counts are taken from the source matrix via
 * getNumRows() and getNumColumns(); the storage member is then handed to
 * the source's getData().
 *
 * \param matrix Source matrix. It is taken by non-const reference, so
 *               const objects and temporaries cannot be passed.
 */
template< typename Real, typename Device, typename Index >
Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index >& matrix )
  : rows( matrix.getNumRows() ),
    columns( matrix.getNumColumns() )
{
  // NOTE(review): presumably getData() fills its argument with the source's
  // elements (it is used as an output parameter elsewhere in this file) -
  // confirm against the declaration in Matrix.h.
  matrix.getData( this->data );
}


template < typename Real,
        typename Device,
        typename Index >
@@ -164,9 +173,9 @@ template < typename Real,
__cuda_callable__
void Matrix< Real, Device, Index >::showMatrix()
{
  for( int i = 0; i < rows; i++ )
  for( int i = 0; i < this->rows; i++ )
  {
    for( int j = 0; j < columns; j++ )
    for( int j = 0; j < this->columns; j++ )
      printf("%.4f ", this->getElement(i,j));
    printf("\n");
  }
@@ -217,21 +226,17 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix
  matrix.getData( pom );
  if( std::is_same< Device, Device2 >::value )
  {
    printf("copy host to host or device to device\n");
    this->data = pom; 
  }
#ifdef HAVE_CUDA
  else if( std::is_same< Device, TNL::Devices::Host >::value )
  {
    printf("copy device to host\n");
    for( int i = 0; i < this->rows; i++ )
      for( int j = 0; j < this->columns; j++ )
        this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] );
  } 
  else if( std::is_same< Device, TNL::Devices::Cuda >::value )
  {
    printf("copy host to device\n");
    printf("data size alocated: %d\n", this->data.getSize() );
    for( int i = 0; i < this->getNumRows(); i++ )
      for( int j = 0; j < this->getNumColumns(); j++ )
      {
+15 −55
Original line number Diff line number Diff line
#include <chrono> 
#include <thread> 

#include "Matrix/Matrix.h"
#include "gem/GEM.h"
@@ -66,9 +68,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
  Index rows, nonzeros;
  readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose );
  VectorType vectorResult( rows );
  vectorResult.setValue(0);
  MatrixType matrixComp;
  VectorType vectorComp( vector );
  
  // Computation
  double* time;
@@ -77,51 +76,18 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
  
  if( verbose > 1 )
    cout << "Starting computation on " << device << endl;
  /*char hostname[256];
  gethostname(hostname, sizeof(hostname));
  printf("PID %d on %s ready for attach\n", getpid(), hostname);*/
  
  for( int i = 0; i < loops; i++ )
  {
    printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops );
#ifdef HAVE_MPI
    MPI_Barrier( MPI_COMM_WORLD );
#endif 
    printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops );
    printf("process %d is in loop %d from all loops %d\n", processID, i, loops );
#ifdef HAVE_CUDA
    printf( "%d process:\n", processID );
    showMatrix<<< 1, 1 >>>( matrix );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    std::cout << vector << endl;
    cout << vectorResult << endl;
#endif
    
    matrixComp = matrix;
    
    printf("%d: matrix coppied\n", processID );
#ifdef HAVE_MPI
    MPI_Barrier( MPI_COMM_WORLD );
#endif 
#ifdef HAVE_CUDA
    printf( "%d process:\n", processID );
    showMatrix<<< 1, 1 >>>( matrixComp );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
#endif
    cout << processID << ":" << vector << endl;
    cout << processID << ":" << vectorComp << endl;
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    printf("%d:copiing Vector \n", processID);
    //vectorComp = vector;
    
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    //vectorResult.setValue( 0 );
    
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    MatrixType matrixComp = matrix;
    VectorType vectorComp( vector );
    vectorResult.setValue( 0 );
    GEM< Real, Device, Index > gem( matrixComp, vectorComp );
    
#ifdef HAVE_MPI
@@ -135,9 +101,8 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
    
    gem.solve( vectorResult, (String)"no", verbose );
    duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
    
    
    if( processID == 0 )
    {
      time[i] = duration;
@@ -154,9 +119,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
          printf( "Norm in %d calculation is %.4f\n", i+1, l2norm);
      }
    }
#ifdef HAVE_MPI
    Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
  }
  if( verbose > 1 && processID == 0 )
    printf("\n ... done!\n");
@@ -176,14 +138,13 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve
#ifdef HAVE_MPI
  Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD );
#endif 
  
  if( processID == 0 ){
    printf("%d: returning\n", processID );
    //printf("%d: returning\n", processID );
    return vectorResult;
    
  }
  else{
    printf("%d: returning\n", processID );
    //("%d: returning\n", processID );
    vectorResult.setValue( 0 );
    return vectorResult;
  }
@@ -230,7 +191,6 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
  // Copy from CPU into matrix dependent on template Device
  vector = vectorHost;
  matrix = matrixHost;
  
}

template < typename Real, typename Index >
@@ -297,13 +257,13 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix,
  MPI_Comm_rank( MPI_COMM_WORLD, &processID );
  MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses );
  
  if( processID == 0 )
  /*if( processID == 0 )
  {
    printf( "%d: %d\n", numOfProcesses, processID );
    matrix.showMatrix();
    cout << vector << endl;
  }
  
  */
  
  Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses );
  matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() );
Loading