Loading src/Benchmarks/GEM/Matrix/Matrix.h +11 −3 Original line number Diff line number Diff line Loading @@ -62,15 +62,23 @@ class Matrix __cuda_callable__ Real getElement( Index row, Index col ) const; /** * Returns ROW on row and starting column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * Returns ROW on row and starting column. Can be * called from host only. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void setRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host if the matrix is on device and also can be called from Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +24 −0 Original line number Diff line number Diff line Loading @@ -128,6 +128,30 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::setRow( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < size-1; i++ ) this->setElement( row, col + i, mainRow[ i ] ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < size-1; i++ ) this->data.setElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i, mainRow[ i ] ); } #endif } template < typename Real, typename Device, typename Index > Loading src/Benchmarks/GEM/gem.h +1 −3 Original line number Diff line number Diff line Loading @@ -18,8 +18,6 @@ #include <mpi.h> #endif #include "gem/GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define COMPARE_RESULTS true; Loading Loading @@ -99,7 +97,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve if( verbose > 1 && processID == 0 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, (String)"no", verbose ); gem.solve( vectorResult, pivoting, verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; Loading src/Benchmarks/GEM/gem/GEMdevice.h +279 −66 Original line number Diff line number Diff line Loading @@ -3,9 +3,12 @@ #ifdef HAVE_CUDA //#include "GEMkernels.h" #include "GEMkernels.h" #ifdef HAVE_MPI #include "TNL/Communicators/MpiCommunicator.h" #include "TNL/Communicators/MPITypeResolver.h" #include <mpi.h> #endif Loading Loading @@ -91,92 +94,302 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); Index colPointerMain = 0; #endif Index colPointerMain = 0; while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); Index colPointer = colPointerMain-colPointerMain/this->A.getNumRows() * this->A.getNumRows(); Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); #endif Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } } else { TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); if( verbose > 2 ) MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { printf( "%d: [", processID); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf("]\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; mainRowVec = mainRowVecHost; delete []mainRow; #else TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); MPI_Barrier(MPI_COMM_WORLD); #endif /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; /*if( pivoting == "yes" ) int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); Index fromRow = 0; if( processID < (Index)(colPointerMain/this->A.getNumRows()) ) fromRow = this->A.getNumRows(); else if( processID == (Index)(colPointerMain/this->A.getNumRows()) ) fromRow = colPointer; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1); outMax.setValue(0); outPos.setValue(-1); if( fromRow != this->A.getNumRows() ) { int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize); printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n", colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow ); outMax.setSize( reduceGridSize ); outPos.setSize( reduceGridSize ); //outMax.setValue(0); outPos.setValue(0); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() ); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; *pom = 0; cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); //*pom = 0; //cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); } //std::cout << processID << ": " << outMax << outPos << std::endl; Real *data, *recvData; data = new Real[2]; recvData = new Real[2*numOfProcesses]; data[0] = outMax.getElement(0); data[1] = outPos.getElement(0); MPI_Barrier( MPI_COMM_WORLD ); MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); /*MPI_Barrier( MPI_COMM_WORLD ); if( processID == 0 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); }*/ int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses ); TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses ); for( int i = 0; i < numOfProcesses; i++ ) { outMaxHost[ i ] = (Real)recvData[ 2*i ]; outPosHost[ i ] = (Index)recvData[ 2*i+1 ]; } delete []data; delete []recvData; /*if( processID == 0 ) { printf( "%d:", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; }*/ Index ProcessMax = -1; Index Possition = -1; Real Maximum = 0; for( int i = 0; i < numOfProcesses; i++ ) { if( outPosHost[ i ] != -1 ) if( Maximum < outMaxHost[ i ] ) { Maximum = outMaxHost[ i ]; ProcessMax = i; Possition = outPosHost[ i ]; } } /*if( processID == 0 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; }*/ Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; /*if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) if( ProcessMax != colPointerMain/this->A.getNumRows() ) { if( processID == ProcessMax ) { this->A.getRow( Possition, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( Possition ); } } else { if( colPointerMain/this->A.getNumRows() == processID ){ if( Possition != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); } } TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[size]; if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRowSwap[ i ] ); printf( "\n" ); this->A.setRow( Possition, colPointerMain, mainRowSwap, size ); this->b.setElement( Possition, mainRowSwap[ size-1 ] ); } else if( processID == colPointerMain/this->A.getNumRows() ) { this->A.getRow( colPointer, colPointerMain, mainRowSwap, size ); mainRowSwap[ size-1 ] = this->b.getElement( colPointer ); TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf( "\n" ); this->A.setRow( colPointer, colPointerMain, mainRow, size ); this->b.setElement( colPointer, mainRow[ size-1 ] ); } delete []mainRowSwap; } #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 0 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) std::cout << this->b << std::endl; #endif } else { #ifdef HAVE_MPI Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); /*for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); }*/ } //else { //TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); if( verbose > 2 ) { printf( "%d: [", processID); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf("]\n"); } //} TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; #else //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); #endif } /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), Loading src/Benchmarks/GEM/gem/GEMkernels.h +5 −5 Original line number Diff line number Diff line Loading @@ -60,10 +60,10 @@ __inline__ __device__ void blockReduceArgMax(Real& val, int& index) template <typename Real > __global__ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, int fromRow, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition) { int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain; int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + fromRow; Real firstElementInRow = rowPointer >= A->getNumRows() ? 0 : TNL::abs( A->getElement(rowPointer, colPointerMain) ); int index = rowPointer; blockReduceArgMax( firstElementInRow, index ); Loading Loading @@ -95,12 +95,12 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) int colPointerMain, int numBlocksOnRow, int positionPivot ) { if( *positionPivot > colPointerMain ) if( positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int rowPointer2 = positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Loading Loading
src/Benchmarks/GEM/Matrix/Matrix.h +11 −3 Original line number Diff line number Diff line Loading @@ -62,15 +62,23 @@ class Matrix __cuda_callable__ Real getElement( Index row, Index col ) const; /** * Returns ROW on row and starting column. Cannot be * called from host if the matrix is on device and also cannot be called from * device if the matrix is on host. * Returns ROW on row and starting column. Can be * called from host only. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void setRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host if the matrix is on device and also can be called from Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +24 −0 Original line number Diff line number Diff line Loading @@ -128,6 +128,30 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::setRow( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); if( std::is_same< Device, TNL::Devices::Host >::value ) { #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < size-1; i++ ) this->setElement( row, col + i, mainRow[ i ] ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < size-1; i++ ) this->data.setElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i, mainRow[ i ] ); } #endif } template < typename Real, typename Device, typename Index > Loading
src/Benchmarks/GEM/gem.h +1 −3 Original line number Diff line number Diff line Loading @@ -18,8 +18,6 @@ #include <mpi.h> #endif #include "gem/GEMkernels.h" #include "TNL/Cuda/MemoryHelpers.h" #define COMPARE_RESULTS true; Loading Loading @@ -99,7 +97,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve if( verbose > 1 && processID == 0 ) cout << "starting computation number " << i+1 << endl; gem.solve( vectorResult, (String)"no", verbose ); gem.solve( vectorResult, pivoting, verbose ); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; Loading
src/Benchmarks/GEM/gem/GEMdevice.h +279 −66 Original line number Diff line number Diff line Loading @@ -3,9 +3,12 @@ #ifdef HAVE_CUDA //#include "GEMkernels.h" #include "GEMkernels.h" #ifdef HAVE_MPI #include "TNL/Communicators/MpiCommunicator.h" #include "TNL/Communicators/MPITypeResolver.h" #include <mpi.h> #endif Loading Loading @@ -91,92 +94,302 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting #ifdef HAVE_MPI MPI_Comm_rank( MPI_COMM_WORLD, &processID ); MPI_Comm_size( MPI_COMM_WORLD, &numOfProcesses ); Index colPointerMain = 0; #endif Index colPointerMain = 0; while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI TNL::Communicators::MpiCommunicator::Barrier( MPI_COMM_WORLD ); Index colPointer = colPointerMain-colPointerMain/this->A.getNumRows() * this->A.getNumRows(); Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); #endif Index colPointer = colPointerMain-(Index)(colPointerMain/this->A.getNumRows()) * this->A.getNumRows(); #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } } else { TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); if( verbose > 2 ) MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { printf( "%d: [", processID); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf("]\n"); showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; mainRowVec = mainRowVecHost; delete []mainRow; #else TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); MPI_Barrier(MPI_COMM_WORLD); #endif /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec; /*if( pivoting == "yes" ) int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; if( pivoting == "yes" ) { // PIVOTING int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize ); Index fromRow = 0; if( processID < (Index)(colPointerMain/this->A.getNumRows()) ) fromRow = this->A.getNumRows(); else if( processID == (Index)(colPointerMain/this->A.getNumRows()) ) fromRow = colPointer; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(1); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(1); outMax.setValue(0); outPos.setValue(-1); if( fromRow != this->A.getNumRows() ) { int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 ); int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize ); int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 ); //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound ); TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize); TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize); printf("%d,%d: reduceBlockSize = %d, reduceGridSize = %d, reduceGridSizeRound = %d %d\n", colPointerMain, processID, reduceBlockSize, reduceGridSize, reduceGridSizeRound, fromRow ); outMax.setSize( reduceGridSize ); outPos.setSize( reduceGridSize ); //outMax.setValue(0); outPos.setValue(0); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() ); findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, fromRow, colPointerMain, outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; *pom = 0; cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); //*pom = 0; //cudaMemcpy( pom, pivot, sizeof(int), cudaMemcpyDeviceToHost); } //std::cout << processID << ": " << outMax << outPos << std::endl; Real *data, *recvData; data = new Real[2]; recvData = new Real[2*numOfProcesses]; data[0] = outMax.getElement(0); data[1] = outPos.getElement(0); MPI_Barrier( MPI_COMM_WORLD ); MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); /*MPI_Barrier( MPI_COMM_WORLD ); if( processID == 0 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d:", processID ); for( int i = 0; i < 2*numOfProcesses; i++ ) printf( "%.2f ", recvData[i] ); printf("\n"); }*/ int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer; int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 ); int numOfBlocks = this->A.getNumRows() * numBlocksOnRow; TNL::Containers::Vector< Real, TNL::Devices::Host, Index > outMaxHost( numOfProcesses ); TNL::Containers::Vector< Index, TNL::Devices::Host, Index > outPosHost( numOfProcesses ); for( int i = 0; i < numOfProcesses; i++ ) { outMaxHost[ i ] = (Real)recvData[ 2*i ]; outPosHost[ i ] = (Index)recvData[ 2*i+1 ]; } delete []data; delete []recvData; /*if( processID == 0 ) { printf( "%d:", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << outMaxHost << std::endl; std::cout << outPosHost << std::endl; }*/ Index ProcessMax = -1; Index Possition = -1; Real Maximum = 0; for( int i = 0; i < numOfProcesses; i++ ) { if( outPosHost[ i ] != -1 ) if( Maximum < outMaxHost[ i ] ) { Maximum = outMaxHost[ i ]; ProcessMax = i; Possition = outPosHost[ i ]; } } /*if( processID == 0 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; } MPI_Barrier( MPI_COMM_WORLD ); if( processID == 1 ) { printf( "%d: ", processID ); std::cout << ProcessMax << " " << Maximum << " " << Possition << std::endl; }*/ Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; /*if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer ) if( ProcessMax != colPointerMain/this->A.getNumRows() ) { if( processID == ProcessMax ) { this->A.getRow( Possition, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( Possition ); } } else { if( colPointerMain/this->A.getNumRows() == processID ){ if( Possition != colPointer ) { if( verbose > 1 ) { std::cout << std::endl; std::cout << "Choosing element at " << *pom << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << *pom << "-th rows ... " << std::endl; std::cout << "Choosing element at " << Possition << "-th row as pivot with value..." << std::endl; std::cout << "Swapping " << colPointer << "-th and " << Possition << "-th rows ... " << std::endl; } swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot ); swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); } } TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[size]; if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, size, colPointerMain/this->A.getNumRows(), 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRowSwap[ i ] ); printf( "\n" ); this->A.setRow( Possition, colPointerMain, mainRowSwap, size ); this->b.setElement( Possition, mainRowSwap[ size-1 ] ); } else if( processID == colPointerMain/this->A.getNumRows() ) { this->A.getRow( colPointer, colPointerMain, mainRowSwap, size ); mainRowSwap[ size-1 ] = this->b.getElement( colPointer ); TNL::Communicators::MpiCommunicator::ISend( mainRowSwap, size, ProcessMax, 0 ); printf( "%d: ", processID ); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf( "\n" ); this->A.setRow( colPointer, colPointerMain, mainRow, size ); this->b.setElement( colPointer, mainRow[ size-1 ] ); } delete []mainRowSwap; } #ifdef HAVE_MPI if( processID == 0 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) { showMatrix<<< 1, 1 >>>( this->A ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } MPI_Barrier(MPI_COMM_WORLD); if( processID == 0 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 1 ) std::cout << this->b << std::endl; MPI_Barrier(MPI_COMM_WORLD); if( processID == 2 ) std::cout << this->b << std::endl; #endif } else { #ifdef HAVE_MPI Real* mainRow; int size = this->A.getNumColumns() - colPointerMain + 1; mainRow = new Real[size]; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRow, size ); mainRow[ size-1 ] = this->b.getElement( colPointer ); /*for( int i = 0; i < numOfProcesses; i++ ) if( i != processID ){ TNL::Communicators::MpiCommunicator::ISend( mainRow, size, i, 0 ); }*/ } //else { //TNL::Communicators::MpiCommunicator::Recv( mainRow, size, colPointerMain/this->A.getNumRows(), 0 ); TNL::Communicators::MpiCommunicator::Bcast( mainRow, size, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); if( verbose > 2 ) { printf( "%d: [", processID); for( int i = 0; i < size; i++ ) printf( "%.2f ", mainRow[ i ] ); printf("]\n"); } //} TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRowVecHost; delete []mainRow; #else //printf("%d: colPointer = %d, colPointerMain = %d\n", processID, colPointer, colPointerMain ); this->A.getRow(colPointer, colPointerMain, mainRowVec ); #endif } /*std::cout << processID << ": " << std::endl; std::cout << mainRowVec << std::endl;*/ if( verbose > 1 ) printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() ); GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, device_vector.getView(), Loading
src/Benchmarks/GEM/gem/GEMkernels.h +5 −5 Original line number Diff line number Diff line Loading @@ -60,10 +60,10 @@ __inline__ __device__ void blockReduceArgMax(Real& val, int& index) template <typename Real > __global__ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, int fromRow, int colPointerMain, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition) { int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain; int rowPointer = threadIdx.x + blockDim.x * blockIdx.x + fromRow; Real firstElementInRow = rowPointer >= A->getNumRows() ? 0 : TNL::abs( A->getElement(rowPointer, colPointerMain) ); int index = rowPointer; blockReduceArgMax( firstElementInRow, index ); Loading Loading @@ -95,12 +95,12 @@ template <typename Real > __global__ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, int colPointerMain, int numBlocksOnRow, int* positionPivot ) int colPointerMain, int numBlocksOnRow, int positionPivot ) { if( *positionPivot > colPointerMain ) if( positionPivot > colPointerMain ) { int rowPointer1 = colPointerMain; int rowPointer2 = *positionPivot; int rowPointer2 = positionPivot; int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain; if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() ) { Loading