Loading src/Benchmarks/GEM/Makefile +7 −7 Original line number Diff line number Diff line include Makefile.inc SUBDIRS = HEADERS = "./TNL/src/TNL" HEADERS = "./TNL/tnl-dev/src/TNL" SOURCES = tml-gem.cpp CUDA_SOURCES = tnl-gem-cuda.cu TARGETS = tnl-gem Loading Loading @@ -41,14 +41,14 @@ clean_curdir: dist: clean tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES) #$(TARGETS): % : %.o # $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS) $(TARGETS): % : %.o $(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS) $(CUDA_TARGETS): % : %.cu.o $(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS) $(CUDA_CXX) $(CUDA_LDFLAGS) $(MPI_FLAGS) $(MPI_FLAGT) -o $@ $< $(CUDA_LDLIBS) #$(SOURCES:%.cpp=%.o): %.o: %.cpp # $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(SOURCES:%.cpp=%.o): %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $< $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(MPI_FLAGS) -c -o $@ $< src/Benchmarks/GEM/Makefile.inc +10 −0 Original line number Diff line number Diff line Loading @@ -5,8 +5,11 @@ PROJECT_NAME = tnl-gem TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_OPENMP = yes WITH_DEBUG = yes MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include # If TNL is installed on your system, CUDA arch can be detected automatically using # a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. Loading Loading @@ -39,6 +42,13 @@ ifeq ( $(WITH_CUDA), yes ) endif endif # Set-up MPI_FLAG ifeq ( $(WITH_MPI), yes ) MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include endif # Set-up CPPFLAGS CPPFLAGS = -MD -MP Loading src/Benchmarks/GEM/Matrix/Matrix.h +1 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ class Matrix Matrix(); Matrix( const Index rows, const Index columns ); Matrix( Matrix< Real, Device, Index>& matrix ); /** * Sets dimension for matrix of rows x columns elements and allocates memory Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +13 −8 Original line number Diff line number Diff line //#include "Matrix.h" #define DEBUG 0 template < typename Real, Loading @@ -17,6 +15,17 @@ Matrix< Real, Device, Index >::Matrix( Index rows, Index columns ) { this->setDimensions( rows, columns ); } template < typename Real, typename Device, typename Index > Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix ) : rows( matrix.getNumRows() ), columns( matrix.getNumColumns() ) { matrix.getData( this->data ); } template < typename Real, typename Device, typename Index > Loading Loading @@ -164,9 +173,9 @@ template < typename Real, __cuda_callable__ void Matrix< Real, Device, Index >::showMatrix() { for( int i = 0; i < rows; i++ ) for( int i = 0; i < this->rows; i++ ) { for( int j = 0; j < columns; j++ ) for( int j = 0; j < this->columns; j++ ) printf("%.4f ", this->getElement(i,j)); printf("\n"); } Loading Loading @@ -217,21 +226,17 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { printf("copy device to host\n"); for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { printf("copy host to device\n"); printf("data size alocated: %d\n", this->data.getSize() ); for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { Loading src/Benchmarks/GEM/gem.h +15 −55 Original line number Diff line number Diff line #include <chrono> #include <thread> #include "Matrix/Matrix.h" #include "gem/GEM.h" Loading Loading @@ -66,9 +68,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); vectorResult.setValue(0); MatrixType matrixComp; VectorType vectorComp( vector ); // Computation double* time; Loading @@ -77,51 +76,18 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve if( verbose > 1 ) cout << "Starting computation on " << device << endl; /*char hostname[256]; gethostname(hostname, sizeof(hostname)); printf("PID %d on %s ready for attach\n", getpid(), hostname);*/ for( int i = 0; i < loops; i++ ) { printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops ); printf("process %d is in loop %d from all loops %d\n", processID, i, loops ); #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrix ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << vector << endl; cout << vectorResult << endl; #endif matrixComp = matrix; printf("%d: matrix coppied\n", processID ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrixComp ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif cout << processID << ":" << vector << endl; cout << processID << ":" << vectorComp << endl; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif printf("%d:copiing Vector \n", processID); //vectorComp = vector; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif //vectorResult.setValue( 0 ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); #ifdef HAVE_MPI Loading @@ -135,9 +101,8 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve gem.solve( vectorResult, (String)"no", verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ) { time[i] = duration; Loading @@ -154,9 +119,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve printf( "Norm in %d calculation is %.4f\n", i+1, l2norm); } } #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif } if( verbose > 1 && processID == 0 ) printf("\n ... done!\n"); Loading @@ -176,14 +138,13 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ){ printf("%d: returning\n", processID ); //printf("%d: returning\n", processID ); return vectorResult; } else{ printf("%d: returning\n", processID ); //("%d: returning\n", processID ); vectorResult.setValue( 0 ); return vectorResult; } Loading Loading @@ -230,7 +191,6 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; } template < typename Real, typename Index > Loading Loading @@ -297,13 +257,13 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); if( processID == 0 ) /*if( processID == 0 ) { printf( "%d: %d\n", numOfProcesses, processID ); matrix.showMatrix(); cout << vector << endl; } */ Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses ); matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() ); Loading Loading
src/Benchmarks/GEM/Makefile +7 −7 Original line number Diff line number Diff line include Makefile.inc SUBDIRS = HEADERS = "./TNL/src/TNL" HEADERS = "./TNL/tnl-dev/src/TNL" SOURCES = tml-gem.cpp CUDA_SOURCES = tnl-gem-cuda.cu TARGETS = tnl-gem Loading Loading @@ -41,14 +41,14 @@ clean_curdir: dist: clean tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES) #$(TARGETS): % : %.o # $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS) $(TARGETS): % : %.o $(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS) $(CUDA_TARGETS): % : %.cu.o $(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS) $(CUDA_CXX) $(CUDA_LDFLAGS) $(MPI_FLAGS) $(MPI_FLAGT) -o $@ $< $(CUDA_LDLIBS) #$(SOURCES:%.cpp=%.o): %.o: %.cpp # $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(SOURCES:%.cpp=%.o): %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $< $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(MPI_FLAGS) -c -o $@ $<
src/Benchmarks/GEM/Makefile.inc +10 −0 Original line number Diff line number Diff line Loading @@ -5,8 +5,11 @@ PROJECT_NAME = tnl-gem TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_OPENMP = yes WITH_DEBUG = yes MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include # If TNL is installed on your system, CUDA arch can be detected automatically using # a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. Loading Loading @@ -39,6 +42,13 @@ ifeq ( $(WITH_CUDA), yes ) endif endif # Set-up MPI_FLAG ifeq ( $(WITH_MPI), yes ) MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include endif # Set-up CPPFLAGS CPPFLAGS = -MD -MP Loading
src/Benchmarks/GEM/Matrix/Matrix.h +1 −0 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ class Matrix Matrix(); Matrix( const Index rows, const Index columns ); Matrix( Matrix< Real, Device, Index>& matrix ); /** * Sets dimension for matrix of rows x columns elements and allocates memory Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +13 −8 Original line number Diff line number Diff line //#include "Matrix.h" #define DEBUG 0 template < typename Real, Loading @@ -17,6 +15,17 @@ Matrix< Real, Device, Index >::Matrix( Index rows, Index columns ) { this->setDimensions( rows, columns ); } template < typename Real, typename Device, typename Index > Matrix< Real, Device, Index >::Matrix( Matrix< Real, Device, Index>& matrix ) : rows( matrix.getNumRows() ), columns( matrix.getNumColumns() ) { matrix.getData( this->data ); } template < typename Real, typename Device, typename Index > Loading Loading @@ -164,9 +173,9 @@ template < typename Real, __cuda_callable__ void Matrix< Real, Device, Index >::showMatrix() { for( int i = 0; i < rows; i++ ) for( int i = 0; i < this->rows; i++ ) { for( int j = 0; j < columns; j++ ) for( int j = 0; j < this->columns; j++ ) printf("%.4f ", this->getElement(i,j)); printf("\n"); } Loading Loading @@ -217,21 +226,17 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { printf("copy device to host\n"); for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { printf("copy host to device\n"); printf("data size alocated: %d\n", this->data.getSize() ); for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { Loading
src/Benchmarks/GEM/gem.h +15 −55 Original line number Diff line number Diff line #include <chrono> #include <thread> #include "Matrix/Matrix.h" #include "gem/GEM.h" Loading Loading @@ -66,9 +68,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); vectorResult.setValue(0); MatrixType matrixComp; VectorType vectorComp( vector ); // Computation double* time; Loading @@ -77,51 +76,18 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve if( verbose > 1 ) cout << "Starting computation on " << device << endl; /*char hostname[256]; gethostname(hostname, sizeof(hostname)); printf("PID %d on %s ready for attach\n", getpid(), hostname);*/ for( int i = 0; i < loops; i++ ) { printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops ); printf("process %d is in loop %d from all loops %d\n", processID, i, loops ); #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrix ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << vector << endl; cout << vectorResult << endl; #endif matrixComp = matrix; printf("%d: matrix coppied\n", processID ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrixComp ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif cout << processID << ":" << vector << endl; cout << processID << ":" << vectorComp << endl; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif printf("%d:copiing Vector \n", processID); //vectorComp = vector; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif //vectorResult.setValue( 0 ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorResult.setValue( 0 ); GEM< Real, Device, Index > gem( matrixComp, vectorComp ); #ifdef HAVE_MPI Loading @@ -135,9 +101,8 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve gem.solve( vectorResult, (String)"no", verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ) { time[i] = duration; Loading @@ -154,9 +119,6 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve printf( "Norm in %d calculation is %.4f\n", i+1, l2norm); } } #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif } if( verbose > 1 && processID == 0 ) printf("\n ... done!\n"); Loading @@ -176,14 +138,13 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ){ printf("%d: returning\n", processID ); //printf("%d: returning\n", processID ); return vectorResult; } else{ printf("%d: returning\n", processID ); //("%d: returning\n", processID ); vectorResult.setValue( 0 ); return vectorResult; } Loading Loading @@ -230,7 +191,6 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; } template < typename Real, typename Index > Loading Loading @@ -297,13 +257,13 @@ void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); if( processID == 0 ) /*if( processID == 0 ) { printf( "%d: %d\n", numOfProcesses, processID ); matrix.showMatrix(); cout << vector << endl; } */ Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses ); matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() ); Loading