Loading src/Benchmarks/GEM/Makefile +6 −6 Original line number Diff line number Diff line Loading @@ -41,14 +41,14 @@ clean_curdir: dist: clean tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES) $(TARGETS): % : %.o $(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS) #$(TARGETS): % : %.o # $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS) $(CUDA_TARGETS): % : %.cu.o $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS) $(SOURCES:%.cpp=%.o): %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< #$(SOURCES:%.cpp=%.o): %.o: %.cpp # $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $< src/Benchmarks/GEM/Makefile.inc +3 −2 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ CUDA_ARCH = auto # Set-up compilers CXX = g++ CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc #CUDA_CXX =/home/maty/.openmpi/bin/mpicc # Set-up CXX_FLAGS CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS) Loading @@ -27,10 +28,10 @@ else endif # Set-up CUDA_CXXFLAGS CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA --std c++14 -I$(TNL_HEADERS) CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS) ifeq ( $(WITH_CUDA), yes ) CUDA_CXXFLAGS += -DHAVE_CUDA CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI ifeq ( $(CUDA_ARCH), auto ) CUDA_CXXFLAGS += `tnl-cuda-arch` else Loading src/Benchmarks/GEM/Matrix/Matrix.h +23 −3 Original line number Diff line number Diff line Loading @@ -51,7 +51,7 @@ class Matrix void setElement( Index row, Index col, Real value ); /** * Returns element on row and column but on device that it's saved! Cannot be * Returns element on row and column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * Loading @@ -60,11 +60,31 @@ class Matrix */ __cuda_callable__ Real getElement( Index row, Index col ) const; /** * Returns ROW on row and starting column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host if the matrix is on device and also can be called from * device if the matrix is on host. Values are saved in vector mainRow * * @param row and column, mainRow is output vector. * @return void */ void getRow( Index row, Index col, Vector& mainRow ); /** * Saves data array into array parameter even in cross device function matrix. * Data array is huge on device for coalesced memory access condition! * * @param VectorView eather on host or device, depends where we want to store the data. * @param Vector eather on host or device, depends where we want to store the data. * @return void */ template < typename Device1 > Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +57 −8 Original line number Diff line number Diff line //#include "Matrix.h" #define DEBUG 0 template < typename Real, Loading Loading @@ -92,12 +95,65 @@ Real Matrix< Real, Device, Index >::getElement( Index row, Index col ) const return 1; } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < size-1; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < size-1; i++ ) mainRow[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ); } #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRow ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < mainRow.getSize()-1; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { TNL::Containers::Vector< Real, TNL::Devices::Host, Index > tempVec( mainRow.getSize() ); for( int i = 0; i < mainRow.getSize()-1; i++ ) tempVec[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ); mainRow = tempVec; } #endif } template < typename Real, typename Device, typename Index > template < typename Device1 > void Matrix< Real, Device, Index >::getData( TNL::Containers::Vector< Real, Device1, Index >& dataOut ) { /*dataOut.setSize( this->data.getSize() ); for( int i = 0; i < this->data.getSize(); i++ ) dataOut.setElement( i , this->data.getElement( i ));*/ dataOut = this->data; } Loading Loading @@ -161,31 +217,24 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { //printf("copy host to host or device to device\n"); printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("copy device to host\n"); #endif for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { #if DEBUG printf("copy host to device\n"); printf("data size alocated: %d\n", this->data.getSize() ); #endif for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { #if DEBUG printf("copying element i,j = %d, %d with value %.4f\n", i, j, pom.getElement( i * this->columns + j ) ); #endif this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j, pom.getElement( i * this->columns + j ) ); } Loading src/Benchmarks/GEM/gem.h +199 −53 Original line number Diff line number Diff line Loading @@ -11,9 +11,15 @@ #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #ifdef HAVE_MPI #include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h" #include <mpi.h> #endif #include "gem/GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define COMPARE_RESULTS true; // Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting using namespace TNL; using namespace TNL::Containers; Loading @@ -24,6 +30,12 @@ template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ); #ifdef HAVE_MPI template< typename Real, typename Index > void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector ); #endif template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); Loading @@ -31,7 +43,8 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName, const int verbose ); const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ); template < typename Real, Loading @@ -42,12 +55,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; int processID = 0; // MPI processID, without mpi == 0 #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName, verbose ); VectorType vectorResult( matrix.getNumRows() ); Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); vectorResult.setValue(0); MatrixType matrixComp; VectorType vectorComp( vector ); // Computation double* time; Loading @@ -58,22 +79,67 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorResult.setValue( 0 ); printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops ); printf("process %d is in loop %d from all loops %d\n", processID, i, loops ); #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrix ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << vector << endl; cout << vectorResult << endl; #endif matrixComp = matrix; printf("%d: matrix coppied\n", processID ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrixComp ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif cout << processID << ":" << vector << endl; cout << processID << ":" << vectorComp << endl; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif printf("%d:copiing Vector \n", processID); //vectorComp = vector; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif //vectorResult.setValue( 0 ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif GEM< Real, Device, Index > gem( matrixComp, vectorComp ); std::clock_t start; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif double duration; std::clock_t start; start = std::clock(); if( verbose > 1 ) if( verbose > 1 && processID == 0 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, verbose ); gem.solve( vectorResult, (String)"no", verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ) { time[i] = duration; if( vectorName == "none" ) Loading @@ -88,7 +154,11 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve printf( "Norm in %d calculation is %.4f\n", i+1, l2norm); } } if( verbose > 1 ) #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif } if( verbose > 1 && processID == 0 ) printf("\n ... done!\n"); double timeMean = 0; Loading @@ -96,19 +166,34 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve timeMean += time[i]; timeMean /= loops; if( processID == 0 ) printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(), rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); delete []time; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ){ printf("%d: returning\n", processID ); return vectorResult; } else{ printf("%d: returning\n", processID ); vectorResult.setValue( 0 ); return vectorResult; } } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName, const int verbose ) const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; Loading @@ -117,9 +202,9 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; rows = matrixHost.getNumRows(); nonzeros = matrixHost.getNumNonzeros(); // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); Loading @@ -132,9 +217,19 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } #ifdef HAVE_MPI cutMatrixVectorMPI( matrixHost, vectorHost ); #endif if( verbose > 2 ) matrixHost.showMatrix(); if( verbose > 2 ) cout << vectorHost << endl; // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; } Loading Loading @@ -186,3 +281,54 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String } inFile.close(); } #ifdef HAVE_MPI template< typename Real, typename Index > void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector ) { Matrix< Real, Devices::Host, Index> matrixTemp; Vector< Real, Devices::Host, Index > vectorTemp; int processID; int numOfProcesses; MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); if( processID == 0 ) { printf( "%d: %d\n", numOfProcesses, processID ); matrix.showMatrix(); cout << vector << endl; } Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses ); matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() ); vectorTemp.setSize( numRowsCUT ); //printf( "%d: %d num of rows = %d\n", numOfProcesses, processID, numRowsCUT ); for( int j = 0; j < matrix.getNumColumns(); j++ ){ for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ matrixTemp.setElement( i, j, matrix.getElement( i + numRowsCUT * processID, j ) ); }else{ matrixTemp.setElement(i,j,0); } } } for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ vectorTemp.setElement( i, vector.getElement( i + numRowsCUT * processID ) ); }else{ vectorTemp.setElement( i, 0 ); } } matrix = matrixTemp; vector = vectorTemp; } #endif No newline at end of file Loading
src/Benchmarks/GEM/Makefile +6 −6 Original line number Diff line number Diff line Loading @@ -41,14 +41,14 @@ clean_curdir: dist: clean tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES) $(TARGETS): % : %.o $(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS) #$(TARGETS): % : %.o # $(CXX) $(LDFLAGS) -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(LDLIBS) $(CUDA_TARGETS): % : %.cu.o $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(CUDA_CXX) $(CUDA_LDFLAGS) -I /home/maty/.openmpi/include -L /home/maty/.openmpi/lib -lmpi -o $@ $< $(CUDA_LDLIBS) $(SOURCES:%.cpp=%.o): %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< #$(SOURCES:%.cpp=%.o): %.o: %.cpp # $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< $(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I /home/maty/.openmpi/include -c -o $@ $<
src/Benchmarks/GEM/Makefile.inc +3 −2 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ CUDA_ARCH = auto # Set-up compilers CXX = g++ CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc #CUDA_CXX =/home/maty/.openmpi/bin/mpicc # Set-up CXX_FLAGS CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS) Loading @@ -27,10 +28,10 @@ else endif # Set-up CUDA_CXXFLAGS CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA --std c++14 -I$(TNL_HEADERS) CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS) ifeq ( $(WITH_CUDA), yes ) CUDA_CXXFLAGS += -DHAVE_CUDA CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI ifeq ( $(CUDA_ARCH), auto ) CUDA_CXXFLAGS += `tnl-cuda-arch` else Loading
src/Benchmarks/GEM/Matrix/Matrix.h +23 −3 Original line number Diff line number Diff line Loading @@ -51,7 +51,7 @@ class Matrix void setElement( Index row, Index col, Real value ); /** * Returns element on row and column but on device that it's saved! Cannot be * Returns element on row and column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * Loading @@ -60,11 +60,31 @@ class Matrix */ __cuda_callable__ Real getElement( Index row, Index col ) const; /** * Returns ROW on row and starting column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host if the matrix is on device and also can be called from * device if the matrix is on host. Values are saved in vector mainRow * * @param row and column, mainRow is output vector. * @return void */ void getRow( Index row, Index col, Vector& mainRow ); /** * Saves data array into array parameter even in cross device function matrix. * Data array is huge on device for coalesced memory access condition! * * @param VectorView eather on host or device, depends where we want to store the data. * @param Vector eather on host or device, depends where we want to store the data. * @return void */ template < typename Device1 > Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +57 −8 Original line number Diff line number Diff line //#include "Matrix.h" #define DEBUG 0 template < typename Real, Loading Loading @@ -92,12 +95,65 @@ Real Matrix< Real, Device, Index >::getElement( Index row, Index col ) const return 1; } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < size-1; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < size-1; i++ ) mainRow[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ); } #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRow ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < mainRow.getSize()-1; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { TNL::Containers::Vector< Real, TNL::Devices::Host, Index > tempVec( mainRow.getSize() ); for( int i = 0; i < mainRow.getSize()-1; i++ ) tempVec[ i ] = this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ); mainRow = tempVec; } #endif } template < typename Real, typename Device, typename Index > template < typename Device1 > void Matrix< Real, Device, Index >::getData( TNL::Containers::Vector< Real, Device1, Index >& dataOut ) { /*dataOut.setSize( this->data.getSize() ); for( int i = 0; i < this->data.getSize(); i++ ) dataOut.setElement( i , this->data.getElement( i ));*/ dataOut = this->data; } Loading Loading @@ -161,31 +217,24 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix matrix.getData( pom ); if( std::is_same< Device, Device2 >::value ) { //printf("copy host to host or device to device\n"); printf("copy host to host or device to device\n"); this->data = pom; } #ifdef HAVE_CUDA else if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("copy device to host\n"); #endif for( int i = 0; i < this->rows; i++ ) for( int j = 0; j < this->columns; j++ ) this->setElement(i,j, pom[ i*TNL::roundToMultiple( this->rows, TNL::Cuda::getWarpSize() ) + j ] ); } else if( std::is_same< Device, TNL::Devices::Cuda >::value ) { #if DEBUG printf("copy host to device\n"); printf("data size alocated: %d\n", this->data.getSize() ); #endif for( int i = 0; i < this->getNumRows(); i++ ) for( int j = 0; j < this->getNumColumns(); j++ ) { #if DEBUG printf("copying element i,j = %d, %d with value %.4f\n", i, j, pom.getElement( i * this->columns + j ) ); #endif this->data.setElement( i*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + j, pom.getElement( i * this->columns + j ) ); } Loading
src/Benchmarks/GEM/gem.h +199 −53 Original line number Diff line number Diff line Loading @@ -11,9 +11,15 @@ #include "TNL/tnl-dev/src/TNL/Cuda/MemoryHelpers.h" #include "TNL/tnl-dev/src/TNL/Devices/Host.h" #ifdef HAVE_MPI #include "TNL/tnl-dev/src/TNL/Communicators/MpiCommunicator.h" #include <mpi.h> #endif #include "gem/GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define COMPARE_RESULTS true; // Prihodit GEMdeice do GEM a dělat dělení na device až tam -> přidat parametr pivoting using namespace TNL; using namespace TNL::Containers; Loading @@ -24,6 +30,12 @@ template < typename Real, typename Index > void calculHostVecOne( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector, const String& vectorName ); #ifdef HAVE_MPI template< typename Real, typename Index > void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector ); #endif template < typename Real, typename Index > void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String& vectorName ); Loading @@ -31,7 +43,8 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName, const int verbose ); const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ); template < typename Real, Loading @@ -42,12 +55,20 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve { typedef Matrix< Real, Device, Index > MatrixType; typedef Vector< Real, Device, Index > VectorType; int processID = 0; // MPI processID, without mpi == 0 #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif MatrixType matrix; VectorType vector; readMatrixVector( matrix, vector, matrixName, vectorName, verbose ); VectorType vectorResult( matrix.getNumRows() ); Index rows, nonzeros; readMatrixVector( matrix, vector, matrixName, vectorName, rows, nonzeros, verbose ); VectorType vectorResult( rows ); vectorResult.setValue(0); MatrixType matrixComp; VectorType vectorComp( vector ); // Computation double* time; Loading @@ -58,22 +79,67 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve cout << "Starting computation on " << device << endl; for( int i = 0; i < loops; i++ ) { MatrixType matrixComp = matrix; VectorType vectorComp( vector ); vectorResult.setValue( 0 ); printf("process %d is waiting in barrier with i = %d, loops = %d\n", processID, i, loops ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif printf("process %d is behind barrier with i = %d, loops = %d and going to calculate next loop!\n", processID, i, loops ); printf("process %d is in loop %d from all loops %d\n", processID, i, loops ); #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrix ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << vector << endl; cout << vectorResult << endl; #endif matrixComp = matrix; printf("%d: matrix coppied\n", processID ); #ifdef HAVE_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif #ifdef HAVE_CUDA printf( "%d process:\n", processID ); showMatrix<<< 1, 1 >>>( matrixComp ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif cout << processID << ":" << vector << endl; cout << processID << ":" << vectorComp << endl; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif printf("%d:copiing Vector \n", processID); //vectorComp = vector; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif //vectorResult.setValue( 0 ); #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif GEM< Real, Device, Index > gem( matrixComp, vectorComp ); std::clock_t start; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif double duration; std::clock_t start; start = std::clock(); if( verbose > 1 ) if( verbose > 1 && processID == 0 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, pivoting, verbose ); gem.solve( vectorResult, (String)"no", verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ) { time[i] = duration; if( vectorName == "none" ) Loading @@ -88,7 +154,11 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve printf( "Norm in %d calculation is %.4f\n", i+1, l2norm); } } if( verbose > 1 ) #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif } if( verbose > 1 && processID == 0 ) printf("\n ... done!\n"); double timeMean = 0; Loading @@ -96,19 +166,34 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve timeMean += time[i]; timeMean /= loops; if( processID == 0 ) printf("%20s %15d %15d %20s %15s %15s %10d %15.5f %15.5f\n", matrixName.c_str(), matrix.getNumRows(), matrix.getNumNonzeros(), vectorName == "none" ? "-":vectorName.c_str(), rows, nonzeros, vectorName == "none" ? "-":vectorName.c_str(), device.c_str(), typeid(Real).name() == (string)"f" ? "float":"double", loops, timeMean, error); delete []time; #ifdef HAVE_MPI Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif if( processID == 0 ){ printf("%d: returning\n", processID ); return vectorResult; } else{ printf("%d: returning\n", processID ); vectorResult.setValue( 0 ); return vectorResult; } } template < typename Real, typename Index, typename Device > void readMatrixVector( Matrix< Real, Device, Index>& matrix, Vector< Real, Device, Index >& vector, const String& matrixName, const String& vectorName, const int verbose ) const String& matrixName, const String& vectorName, Index &rows, Index &nonzeros, const int verbose ) { typedef Matrix< Real, Devices::Host, Index> MatrixHost; MatrixHost matrixHost; Loading @@ -117,9 +202,9 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, // Get matrix Matrices::MatrixReader< MatrixHost > m; m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose ); if( verbose > 2 ) matrixHost.showMatrix(); matrix = matrixHost; rows = matrixHost.getNumRows(); nonzeros = matrixHost.getNumNonzeros(); // Get vector Vector< Real, Devices::Host, Index > vectorHost( matrixHost.getNumRows() ); Loading @@ -132,9 +217,19 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix, if( verbose > 1 ) cout << "reading vector " << vectorName << endl; } #ifdef HAVE_MPI cutMatrixVectorMPI( matrixHost, vectorHost ); #endif if( verbose > 2 ) matrixHost.showMatrix(); if( verbose > 2 ) cout << vectorHost << endl; // Copy from CPU into matrix dependent on template Device vector = vectorHost; matrix = matrixHost; } Loading Loading @@ -186,3 +281,54 @@ void readVector( Vector< Real, Devices::Host, Index >& vector_host, const String } inFile.close(); } #ifdef HAVE_MPI template< typename Real, typename Index > void cutMatrixVectorMPI( Matrix< Real, Devices::Host, Index >& matrix, Vector< Real, Devices::Host, Index >& vector ) { Matrix< Real, Devices::Host, Index> matrixTemp; Vector< Real, Devices::Host, Index > vectorTemp; int processID; int numOfProcesses; MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); if( processID == 0 ) { printf( "%d: %d\n", numOfProcesses, processID ); matrix.showMatrix(); cout << vector << endl; } Index numRowsCUT = TNL::roundUpDivision( matrix.getNumRows(), numOfProcesses ); matrixTemp.setDimensions( numRowsCUT, matrix.getNumRows() ); vectorTemp.setSize( numRowsCUT ); //printf( "%d: %d num of rows = %d\n", numOfProcesses, processID, numRowsCUT ); for( int j = 0; j < matrix.getNumColumns(); j++ ){ for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ matrixTemp.setElement( i, j, matrix.getElement( i + numRowsCUT * processID, j ) ); }else{ matrixTemp.setElement(i,j,0); } } } for( int i = 0; i < numRowsCUT; i++ ){ if( i + numRowsCUT * processID < matrix.getNumRows() ){ vectorTemp.setElement( i, vector.getElement( i + numRowsCUT * processID ) ); }else{ vectorTemp.setElement( i, 0 ); } } matrix = matrixTemp; vector = vectorTemp; } #endif No newline at end of file