Loading src/Benchmarks/GEM/Makefile.inc +15 −15 Original line number Diff line number Diff line Loading @@ -5,11 +5,13 @@ PROJECT_NAME = tnl-gem TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_MPI = no WITH_OPENMP = yes WITH_DEBUG = yes MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include MPI_FLAGT = MPI_FLAGS = #MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi #MPI_FLAGS =-I/home/maty/.openmpi/include # If TNL is installed on your system, CUDA arch can be detected automatically using # a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. Loading @@ -20,7 +22,6 @@ CUDA_ARCH = auto # Set-up compilers CXX = g++ CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc #CUDA_CXX =/home/maty/.openmpi/bin/mpicc # Set-up CXX_FLAGS CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS) Loading @@ -31,10 +32,10 @@ else endif # Set-up CUDA_CXXFLAGS CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS) CUDA_CXXFLAGS = -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda --std c++14 -I$(TNL_HEADERS) ifeq ($(WITH_CUDA), yes) CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI CUDA_CXXFLAGS += -DHAVE_CUDA ifeq ($(CUDA_ARCH), auto) CUDA_CXXFLAGS += `tnl-cuda-arch` else Loading @@ -42,11 +43,10 @@ ifeq ( $(WITH_CUDA), yes ) endif endif # Set-up MPI_FLAG # Set-up MPI_FLAGT ifeq ($(WITH_MPI), yes) MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include MPI_FLAGT += -L/home/maty/.openmpi/lib -lmpi MPI_FLAGS += -I/home/maty/.openmpi/include -DHAVE_MPI endif # Set-up CPPFLAGS Loading src/Benchmarks/GEM/gem/GEMdevice.h +133 −170 Original line number Diff line number Diff line Loading @@ -58,6 +58,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, } } //std::cout << processID << ": " << x << std::endl; #else x = device_vector; #endif cudaFree( ( void* ) devMat ); TNL_CHECK_CUDA_DEVICE; Loading @@ -71,7 +73,8 @@ template < typename Real, typename Index > bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat;// = TNL::Cuda::passToDevice( this->A ); // Copy matrix A and vector b to GPU Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) &this->A, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; Loading @@ -79,15 +82,8 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting // FOR PIVOTING SET VARIABLES ON DEVICE int* pivot; cudaMalloc(&pivot, sizeof(int)); int* pom = (int*)malloc(sizeof(int));// *pom = -1; if( verbose > 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); } // Initialise MPI variables even without MPI int processID=0; int numOfProcesses=1; Loading @@ -95,72 +91,86 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); #endif Index colPointerMain = 0; while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); if( verbose > 2 ) { #ifdef HAVE_MPI if( processID == 0 ) for( int i = 0; i < numOfProcesses; i++ ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) if( processID == i ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) } for( int i = 0; i < numOfProcesses; i++ ) { if( i == processID ) std::cout << device_vector; MPI_Barrier(MPI_COMM_WORLD); } std::cout << std::endl; #else showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); #endif } MPI_Barrier(MPI_COMM_WORLD); // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows) Index colPointerMain = 0; // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif // Pointer to rows in main process that has colPointerMain. Index colPointer = colPointerMain-TNL::floor(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; // Setting number of threads and blocks for main kernel and for pivoting swapping kernel int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) { // PIVOTING // fromRow saves info for each process saying from which row to start looking for pivot Index fromRow = 0; if( processID < (Index)(colPointerMain/this->A.getNumRows()) ) if( processID < TNL::floor(colPointerMain/this->A.getNumRows()) ) fromRow = this->A.getNumRows(); else if( processID == (Index)(colPointerMain/this->A.getNumRows()) ) else if( processID == TNL::floor(colPointerMain/this->A.getNumRows()) ) fromRow = colPointer; // Inicialising vectors for maximum and position (vectors important for multiple blocks, otherwise its vector with length 1) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1); outMax.setValue(0); outPos.setValue(-1); // those blocks that have rows to look for pivot in should start looking if( fromRow != this->A.getNumRows() ) { // setting size for kernel of pivoting int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n", colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow ); // resizing outMax and outPos for the kernel of pivoting outMax.setSize( reduceGridSize ); outPos.setSize( reduceGridSize ); //outMax.setValue(0); outPos.setValue(0); // two main pivoting kernels executes and saves max and position into 0-th element of vectors outMax and outPos findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -168,11 +178,13 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //*pom = 0; //cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); } //std::cout << processID << ": " << outMax << outPos << std::endl; // Now each process has info about maximum in the "active" part of matrix that they calculate with // Preparing dynamic arrays for MPI send and recv resp. AllGather. // data stores information to send from each process // recvData stores information that is received #ifdef HAVE_MPI Real *data, *recvData; data = new Real[2]; recvData = new Real[2*numOfProcesses]; Loading @@ -181,80 +193,48 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); #endif /*MPI_Barrier( MPI_COMM_WORLD ); if( processID == 0 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); }*/ TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses ); TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses ); // Initialising maximum and possition and process that has the overall maximum ( ?: for non-MPI program) Index ProcessMax = numOfProcesses != 1 ? -1 : 0; Index Possition = numOfProcesses != 1 ? 0 : outPos.getElement(0); Real Maximum = numOfProcesses != 1 ? 0 : outMax.getElement(0); for( int i = 0; i < numOfProcesses; i++ ) #ifdef HAVE_MPI // clasic maximum finding + storing processMax and possition for( int i = 0; i < 2*numOfProcesses; i = i+2 ) { if( recvData[ i+1 ] != -1 ) if( Maximum < recvData[ i ] ) { outMaxHost[ i ] = (Real)recvData[ 2*i ]; outPosHost[ i ] = (Index)recvData[ 2*i+1 ]; Maximum = recvData[ i ]; ProcessMax = TNL::floor( i/2 ); Possition = recvData[ i+1 ]; } } if( verbose > 1 && processID == 0 ) printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax ); // All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays. delete []data; delete []recvData; /*if( processID == 0 ) { printf( "%d:", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; }*/ Index ProcessMax = -1; Index Possition = -1; Real Maximum = 0; for( int i = 0; i < numOfProcesses; i++ ) { if( outPosHost[ i ] != -1 ) if( Maximum < outMaxHost[ i ] ) #endif // Clasic Maximum == 0 then we occured zero pivot so ending this calculation if( Maximum == 0 ) { Maximum = outMaxHost[ i ]; ProcessMax = i; Possition = outPosHost[ i ]; } std::cout << "Ooops zero pivot occured in " << colPointerMain << "-th step." << std::endl; return false; } /*if( processID == 0 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; }*/ // Now when every process has the ProcessMax of pivoting row across all processes // we can send pivoting row to all processes from ProcessMax // mainRow stores pivoting row Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; // If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself. // else means that ProcessMax is in the main process that contains colPointerMain, in this case we need to do normal pivoting // so it swaps rows and fills mainRow normally. if( ProcessMax != colPointerMain/this->A.getNumRows() ) { if( processID == ProcessMax ) Loading @@ -280,24 +260,22 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting mainRow[ size-1 ] = this->b.getElement( colPointer ); } } TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; // Broad casting the pivoting row to all processes #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD); // Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process. // mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows() if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[size]; if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRowSwap[ i ] ); printf( "\n" ); this->A.setRow( Possition, colPointerMain, mainRowSwap, size ); this->b.setElement( Possition, mainRowSwap[ size-1 ] ); } Loading @@ -307,48 +285,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting mainRowSwap[ size-1 ] = this->b.getElement( colPointer ); TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf( "\n" ); this->A.setRow( colPointer, colPointerMain, mainRow, size ); this->b.setElement( colPointer, mainRow[ size-1 ] ); } delete []mainRowSwap; } #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 0 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) std::cout << this->b << std::endl; #endif // Main kernel works with vector as a main row, so all processes has to set mainRowVec. TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; } else else // without pivoting { #ifdef HAVE_MPI Real* mainRow; Loading @@ -358,14 +306,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); } /*for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); }*/ } //else { //TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); if( verbose > 2 ) { printf( "%d: [", processID); Loading @@ -374,23 +318,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting printf("]\n"); } //} TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; #else //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); #endif } /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); // Finally main kernel calculates the GEM for colPointerMain from mainRowVec GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), mainRowVec.getView(), Loading @@ -401,23 +340,47 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting numOfProcesses ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; if( verbose > 2 ) { #ifdef HAVE_MPI for( int i = 0; i < numOfProcesses; i++ ) { if( processID == i ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); } for( int i = 0; i < numOfProcesses; i++ ) { if( i == processID ) std::cout << device_vector; MPI_Barrier(MPI_COMM_WORLD); } std::cout << std::endl; #else showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << this->b << std::endl; printf("\n"); #endif } //TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); // increment colPointerMain for next while passage colPointerMain++; } // delete all used variables cudaFree(pivot); free(pom); cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; // Calculate real result // (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros) calculResultVector( this->A, device_vector, x, processID, numOfProcesses ); return true; Loading src/Benchmarks/GEM/main.h +12 −5 Original line number Diff line number Diff line Loading @@ -46,8 +46,16 @@ int main( int argc, char* argv[] ) MPI_Init(NULL, NULL); // Get the number of processes int processID; int processID = -1; MPI_Comm_rank(MPI_COMM_WORLD, &processID); int numOfProcesses = 0; MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); int numOfDevices = 0; cudaGetDeviceCount(&numOfDevices); if( numOfProcesses > numOfDevices && processID == 0 ) printf("Warning: There is too many processes for computation," " some processes will compute on same device!. (Processes %d, Devices %d)\n", numOfProcesses, numOfDevices); cudaSetDevice(processID%numOfDevices); #endif Loading Loading @@ -85,10 +93,6 @@ int main( int argc, char* argv[] ) auto result = runGEM< float, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting ); } #ifdef HAVE_MPI //printf("%d: about to finalize.\n", processID ); MPI_Finalize(); #endif if( ( precision == "all" || precision == "double" ) ) { Loading @@ -100,6 +104,9 @@ int main( int argc, char* argv[] ) runGEM< double, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting ); } #ifdef HAVE_MPI MPI_Finalize(); #endif return EXIT_SUCCESS; } No newline at end of file Loading
src/Benchmarks/GEM/Makefile.inc +15 −15 Original line number Diff line number Diff line Loading @@ -5,11 +5,13 @@ PROJECT_NAME = tnl-gem TNL_HEADERS = ${HOME}/.local/include INSTALL_DIR = ${HOME}/.local WITH_CUDA = yes WITH_MPI = yes WITH_MPI = no WITH_OPENMP = yes WITH_DEBUG = yes MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include MPI_FLAGT = MPI_FLAGS = #MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi #MPI_FLAGS =-I/home/maty/.openmpi/include # If TNL is installed on your system, CUDA arch can be detected automatically using # a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. Loading @@ -20,7 +22,6 @@ CUDA_ARCH = auto # Set-up compilers CXX = g++ CUDA_CXX =/usr/local/cuda-10.1/bin/nvcc #CUDA_CXX =/home/maty/.openmpi/bin/mpicc # Set-up CXX_FLAGS CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS) Loading @@ -31,10 +32,10 @@ else endif # Set-up CUDA_CXXFLAGS CUDA_CXXFLAGS =-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -DHAVE_MPI --std c++14 -I$(TNL_HEADERS) CUDA_CXXFLAGS = -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda --std c++14 -I$(TNL_HEADERS) ifeq ($(WITH_CUDA), yes) CUDA_CXXFLAGS += -DHAVE_CUDA -DHAVE_MPI CUDA_CXXFLAGS += -DHAVE_CUDA ifeq ($(CUDA_ARCH), auto) CUDA_CXXFLAGS += `tnl-cuda-arch` else Loading @@ -42,11 +43,10 @@ ifeq ( $(WITH_CUDA), yes ) endif endif # Set-up MPI_FLAG # Set-up MPI_FLAGT ifeq ($(WITH_MPI), yes) MPI_FLAGT =-L/home/maty/.openmpi/lib -lmpi MPI_FLAGS =-I/home/maty/.openmpi/include MPI_FLAGT += -L/home/maty/.openmpi/lib -lmpi MPI_FLAGS += -I/home/maty/.openmpi/include -DHAVE_MPI endif # Set-up CPPFLAGS Loading
src/Benchmarks/GEM/gem/GEMdevice.h +133 −170 Original line number Diff line number Diff line Loading @@ -58,6 +58,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix, } } //std::cout << processID << ": " << x << std::endl; #else x = device_vector; #endif cudaFree( ( void* ) devMat ); TNL_CHECK_CUDA_DEVICE; Loading @@ -71,7 +73,8 @@ template < typename Real, typename Index > bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting, int verbose ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat;// = TNL::Cuda::passToDevice( this->A ); // Copy matrix A and vector b to GPU Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) &this->A, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; Loading @@ -79,15 +82,8 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting // FOR PIVOTING SET VARIABLES ON DEVICE int* pivot; cudaMalloc(&pivot, sizeof(int)); int* pom = (int*)malloc(sizeof(int));// *pom = -1; if( verbose > 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); } // Initialise MPI variables even without MPI int processID=0; int numOfProcesses=1; Loading @@ -95,72 +91,86 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); #endif Index colPointerMain = 0; while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); if( verbose > 2 ) { #ifdef HAVE_MPI if( processID == 0 ) for( int i = 0; i < numOfProcesses; i++ ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) if( processID == i ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) } for( int i = 0; i < numOfProcesses; i++ ) { if( i == processID ) std::cout << device_vector; MPI_Barrier(MPI_COMM_WORLD); } std::cout << std::endl; #else showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; printf("\n"); #endif } MPI_Barrier(MPI_COMM_WORLD); // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows) Index colPointerMain = 0; // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); #endif // Pointer to rows in main process that has colPointerMain. Index colPointer = colPointerMain-TNL::floor(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; // Setting number of threads and blocks for main kernel and for pivoting swapping kernel int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) { // PIVOTING // fromRow saves info for each process saying from which row to start looking for pivot Index fromRow = 0; if( processID < (Index)(colPointerMain/this->A.getNumRows()) ) if( processID < TNL::floor(colPointerMain/this->A.getNumRows()) ) fromRow = this->A.getNumRows(); else if( processID == (Index)(colPointerMain/this->A.getNumRows()) ) else if( processID == TNL::floor(colPointerMain/this->A.getNumRows()) ) fromRow = colPointer; // Inicialising vectors for maximum and position (vectors important for multiple blocks, otherwise its vector with length 1) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1); outMax.setValue(0); outPos.setValue(-1); // those blocks that have rows to look for pivot in should start looking if( fromRow != this->A.getNumRows() ) { // setting size for kernel of pivoting int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n", colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow ); // resizing outMax and outPos for the kernel of pivoting outMax.setSize( reduceGridSize ); outPos.setSize( reduceGridSize ); //outMax.setValue(0); outPos.setValue(0); // two main pivoting kernels executes and saves max and position into 0-th element of vectors outMax and outPos findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; Loading @@ -168,11 +178,13 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //*pom = 0; //cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); } //std::cout << processID << ": " << outMax << outPos << std::endl; // Now each process has info about maximum in the "active" part of matrix that they calculate with // Preparing dynamic arrays for MPI send and recv resp. AllGather. // data stores information to send from each process // recvData stores information that is received #ifdef HAVE_MPI Real *data, *recvData; data = new Real[2]; recvData = new Real[2*numOfProcesses]; Loading @@ -181,80 +193,48 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); #endif /*MPI_Barrier( MPI_COMM_WORLD ); if( processID == 0 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); }*/ TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses ); TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses ); // Initialising maximum and possition and process that has the overall maximum ( ?: for non-MPI program) Index ProcessMax = numOfProcesses != 1 ? -1 : 0; Index Possition = numOfProcesses != 1 ? 0 : outPos.getElement(0); Real Maximum = numOfProcesses != 1 ? 0 : outMax.getElement(0); for( int i = 0; i < numOfProcesses; i++ ) #ifdef HAVE_MPI // clasic maximum finding + storing processMax and possition for( int i = 0; i < 2*numOfProcesses; i = i+2 ) { if( recvData[ i+1 ] != -1 ) if( Maximum < recvData[ i ] ) { outMaxHost[ i ] = (Real)recvData[ 2*i ]; outPosHost[ i ] = (Index)recvData[ 2*i+1 ]; Maximum = recvData[ i ]; ProcessMax = TNL::floor( i/2 ); Possition = recvData[ i+1 ]; } } if( verbose > 1 && processID == 0 ) printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax ); // All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays. delete []data; delete []recvData; /*if( processID == 0 ) { printf( "%d:", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; }*/ Index ProcessMax = -1; Index Possition = -1; Real Maximum = 0; for( int i = 0; i < numOfProcesses; i++ ) { if( outPosHost[ i ] != -1 ) if( Maximum < outMaxHost[ i ] ) #endif // Clasic Maximum == 0 then we occured zero pivot so ending this calculation if( Maximum == 0 ) { Maximum = outMaxHost[ i ]; ProcessMax = i; Possition = outPosHost[ i ]; } std::cout << "Ooops zero pivot occured in " << colPointerMain << "-th step." << std::endl; return false; } /*if( processID == 0 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; }*/ // Now when every process has the ProcessMax of pivoting row across all processes // we can send pivoting row to all processes from ProcessMax // mainRow stores pivoting row Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; // If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself. // else means that ProcessMax is in the main process that contains colPointerMain, in this case we need to do normal pivoting // so it swaps rows and fills mainRow normally. if( ProcessMax != colPointerMain/this->A.getNumRows() ) { if( processID == ProcessMax ) Loading @@ -280,24 +260,22 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting mainRow[ size-1 ] = this->b.getElement( colPointer ); } } TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; // Broad casting the pivoting row to all processes #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, ProcessMax, MPI_COMM_WORLD); // Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process. // mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows() if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[size]; if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRowSwap[ i ] ); printf( "\n" ); this->A.setRow( Possition, colPointerMain, mainRowSwap, size ); this->b.setElement( Possition, mainRowSwap[ size-1 ] ); } Loading @@ -307,48 +285,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting mainRowSwap[ size-1 ] = this->b.getElement( colPointer ); TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf( "\n" ); this->A.setRow( colPointer, colPointerMain, mainRow, size ); this->b.setElement( colPointer, mainRow[ size-1 ] ); } delete []mainRowSwap; } #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 0 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) std::cout << this->b << std::endl; #endif // Main kernel works with vector as a main row, so all processes has to set mainRowVec. TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; } else else // without pivoting { #ifdef HAVE_MPI Real* mainRow; Loading @@ -358,14 +306,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); } /*for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); }*/ } //else { //TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); if( verbose > 2 ) { printf( "%d: [", processID); Loading @@ -374,23 +318,18 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting printf("]\n"); } //} TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; #else //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); #endif } /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); // Finally main kernel calculates the GEM for colPointerMain from mainRowVec GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), mainRowVec.getView(), Loading @@ -401,23 +340,47 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting numOfProcesses ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; if( verbose > 2 ) { #ifdef HAVE_MPI for( int i = 0; i < numOfProcesses; i++ ) { if( processID == i ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); } for( int i = 0; i < numOfProcesses; i++ ) { if( i == processID ) std::cout << device_vector; MPI_Barrier(MPI_COMM_WORLD); } std::cout << std::endl; #else showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; std::cout << this->b << std::endl; printf("\n"); #endif } //TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); // increment colPointerMain for next while passage colPointerMain++; } // delete all used variables cudaFree(pivot); free(pom); cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; // Calculate real result // (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros) calculResultVector( this->A, device_vector, x, processID, numOfProcesses ); return true; Loading
src/Benchmarks/GEM/main.h +12 −5 Original line number Diff line number Diff line Loading @@ -46,8 +46,16 @@ int main( int argc, char* argv[] ) MPI_Init(NULL, NULL); // Get the number of processes int processID; int processID = -1; MPI_Comm_rank(MPI_COMM_WORLD, &processID); int numOfProcesses = 0; MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); int numOfDevices = 0; cudaGetDeviceCount(&numOfDevices); if( numOfProcesses > numOfDevices && processID == 0 ) printf("Warning: There is too many processes for computation," " some processes will compute on same device!. (Processes %d, Devices %d)\n", numOfProcesses, numOfDevices); cudaSetDevice(processID%numOfDevices); #endif Loading Loading @@ -85,10 +93,6 @@ int main( int argc, char* argv[] ) auto result = runGEM< float, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting ); } #ifdef HAVE_MPI //printf("%d: about to finalize.\n", processID ); MPI_Finalize(); #endif if( ( precision == "all" || precision == "double" ) ) { Loading @@ -100,6 +104,9 @@ int main( int argc, char* argv[] ) runGEM< double, int, TNL::Devices::Cuda >( matrixName, vectorName, loops, verbose, (String)"GPU", pivoting ); } #ifdef HAVE_MPI MPI_Finalize(); #endif return EXIT_SUCCESS; } No newline at end of file