Loading src/Benchmarks/GEM/Matrix/Matrix.h +9 −0 Original line number Diff line number Diff line Loading @@ -88,6 +88,15 @@ class Matrix */ void setRow( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only for mainRow alocated on GPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void setRowGPU( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host for host and device vector. Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +38 −0 Original line number Diff line number Diff line Loading @@ -149,6 +149,33 @@ void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainR cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::setRowGPU( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = size-1 > 256 ? 256: size-1; int gridSize = TNL::roundToMultiple( size-1, blockSize ); fillRowMatrix<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; } #endif Loading Loading @@ -331,4 +358,15 @@ fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, I data[ thread ] = A->getElement( row, col + thread ); } } template < typename Real, typename Index > __global__ void fillRowMatrix( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col ) { int thread = threadIdx.x + blockIdx.x * blockDim.x; if( thread < size ) { A->setElement( row, col + thread, data[ thread ] ); } } #endif No newline at end of file src/Benchmarks/GEM/gem.h +1 −1 Original line number Diff line number Diff line Loading @@ -141,7 +141,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve } else{ //("%d: returning\n", processID ); vectorResult.setValue( 0 ); //vectorResult.setValue( 0 ); return vectorResult; } } Loading src/Benchmarks/GEM/gem/GEMdeviceMPI.h +30 −39 Original line number Diff line number Diff line Loading @@ -101,7 +101,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE int* pivot; cudaMalloc(&pivot, sizeof(int)); //int* pivot; cudaMalloc(&pivot, sizeof(int)); // Initialise MPI variables even without MPI int processID=0; Loading Loading @@ -144,9 +144,8 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot Index colPointerMain = 0; // Bcast and main row vector and clasic array Real* data; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 ); data = mainRowVec.getData(); Real* data = mainRowVec.getData(); // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI Loading Loading @@ -195,7 +194,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading @@ -205,13 +204,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // data stores information to send from each process // recvData stores information that is received #ifdef HAVE_MPI Real *data, *recvData; data = new Real[2]; Real *sendData, *recvData; sendData = new Real[2]; recvData = new Real[2*numOfProcesses]; data[0] = outMax.getElement(0); data[1] = outPos.getElement(0); sendData[0] = outMax.getElement(0); sendData[1] = outPos.getElement(0); MPI_Barrier( MPI_COMM_WORLD ); MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_Allgather( sendData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); #endif Loading @@ -235,7 +234,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot if( verbose > 1 && processID == 0 ) printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax ); // All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays. delete []data; delete []sendData; delete []recvData; #endif // Clasic Maximum == 0 then we occured zero pivot so ending this calculation Loading @@ -248,7 +247,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // Now when every process has the ProcessMax of pivoting row across all processes // we can send pivoting row to all processes from ProcessMax // mainRow stores pivoting row Array mainRow( this->A.getNumColumns() - colPointerMain + 1 ); //Array mainRow( this->A.getNumColumns() - colPointerMain + 1 ); // If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself. Loading @@ -258,8 +257,8 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot { if( processID == ProcessMax ) { this->A.getRow( Possition, colPointerMain, mainRow ); mainRow.setElement( mainRow.getSize()-1, this->b.getElement( Possition ) ); this->A.getRowGPU( Possition, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( Possition ) ); } } else { if( colPointerMain/this->A.getNumRows() == processID ){ Loading @@ -279,55 +278,46 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL_CHECK_CUDA_DEVICE; } this->A.getRow( colPointer, colPointerMain, mainRow ); mainRow.setElement( mainRow.getSize()-1, this->b.getElement( colPointer ) ); this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); } } // Broad casting the pivoting row to all processes #ifdef HAVE_MPI MPI_Barrier(MPI_COMM_WORLD); TNL::Communicators::MpiCommunicator::Bcast( mainRow.getData(), mainRow.getSize()-1, ProcessMax, MPI_COMM_WORLD); //if( colPointerMain%100 == 0 ) // saveVec( mainRow, mainRow.getSize(), processID, colPointerMain ); if( verbose > 1 ) { printf( "%d: [", processID); for( int i = 0; i < mainRow.getSize(); i++ ) printf( "%.2f ", mainRow.getElement( i ) ); printf("]\n"); } TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns()+1, ProcessMax, MPI_COMM_WORLD); // Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process. // mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows() if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[mainRow.getSize()]; Array mainRowSwapVec( this->A.getNumColumns() - colPointerMain+1 ); Real* mainRowSwap = mainRowSwapVec.getData(); if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, mainRow.getSize(), colPointerMain/this->A.getNumRows(), 0 ); this->A.setRow( Possition, colPointerMain, mainRowSwap, mainRow.getSize() ); this->b.setElement( Possition, mainRowSwap[ mainRow.getSize()-1 ] ); TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, this->A.getNumColumns() - colPointerMain+1, colPointerMain/this->A.getNumRows(), 0 ); this->A.setRowGPU( Possition, colPointerMain, mainRowSwap, this->A.getNumColumns() - colPointerMain+1 ); this->b.setElement( Possition, mainRowSwapVec.getElement( mainRowSwapVec.getSize()-1 ) ); } else if( processID == colPointerMain/this->A.getNumRows() ) { this->A.getRow( colPointer, colPointerMain, mainRowSwap, mainRow.getSize() ); mainRowSwap[ mainRow.getSize()-1 ] = this->b.getElement( colPointer ); this->A.getRowGPU( colPointer, colPointerMain, mainRowSwap, this->A.getNumColumns() - colPointerMain+1 ); mainRowSwapVec.setElement( mainRowSwapVec.getSize() - 1, this->b.getElement( colPointer )); TNL::Communicators::MpiCommunicator::Send( mainRowSwap, mainRow.getSize(), ProcessMax, 0 ); this->A.setRow( colPointer, colPointerMain, mainRow ); this->b.setElement( colPointer, mainRow[ mainRow.getSize()-1 ] ); TNL::Communicators::MpiCommunicator::Send( mainRowSwap, mainRowSwapVec.getSize(), ProcessMax, 0 ); this->A.setRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); this->b.setElement( colPointer, mainRowVec.getElement( mainRowSwapVec.getSize()-1 ) ); } delete []mainRowSwap; //delete []mainRowSwap; } #endif // Main kernel works with vector as a main row, so all processes has to set mainRowVec. //TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRow; //mainRowVec = mainRow; //delete []mainRow; } else // without pivoting Loading @@ -339,8 +329,9 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( verbose > 3 ){ Loading Loading @@ -438,9 +429,9 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot colPointerMain++; } // delete all used variables cudaFree(pivot); cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; //delete &data; // Calculate real result // (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros) Loading src/Benchmarks/GEM/gem/GEMkernelsMPI.h +1 −5 Original line number Diff line number Diff line Loading @@ -85,16 +85,12 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, template <typename Real > __global__ void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot ) TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition ) { int rowPointer = threadIdx.x; Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ]; int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ]; blockReduceArgMax( firstElementInRow, index ); if( threadIdx.x == 0 ) { *positionPivot = index; } } Loading Loading
src/Benchmarks/GEM/Matrix/Matrix.h +9 −0 Original line number Diff line number Diff line Loading @@ -88,6 +88,15 @@ class Matrix */ void setRow( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only for mainRow alocated on GPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void setRowGPU( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host for host and device vector. Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +38 −0 Original line number Diff line number Diff line Loading @@ -149,6 +149,33 @@ void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainR cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::setRowGPU( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = size-1 > 256 ? 256: size-1; int gridSize = TNL::roundToMultiple( size-1, blockSize ); fillRowMatrix<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; } #endif Loading Loading @@ -331,4 +358,15 @@ fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, I data[ thread ] = A->getElement( row, col + thread ); } } template < typename Real, typename Index > __global__ void fillRowMatrix( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col ) { int thread = threadIdx.x + blockIdx.x * blockDim.x; if( thread < size ) { A->setElement( row, col + thread, data[ thread ] ); } } #endif No newline at end of file
src/Benchmarks/GEM/gem.h +1 −1 Original line number Diff line number Diff line Loading @@ -141,7 +141,7 @@ Vector< Real, Device, Index > runGEM( const String& matrixName, const String& ve } else{ //("%d: returning\n", processID ); vectorResult.setValue( 0 ); //vectorResult.setValue( 0 ); return vectorResult; } } Loading
src/Benchmarks/GEM/gem/GEMdeviceMPI.h +30 −39 Original line number Diff line number Diff line Loading @@ -101,7 +101,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index >& device_vector( this->b ); // FOR PIVOTING SET VARIABLES ON DEVICE int* pivot; cudaMalloc(&pivot, sizeof(int)); //int* pivot; cudaMalloc(&pivot, sizeof(int)); // Initialise MPI variables even without MPI int processID=0; Loading Loading @@ -144,9 +144,8 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot Index colPointerMain = 0; // Bcast and main row vector and clasic array Real* data; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 ); data = mainRowVec.getData(); Real* data = mainRowVec.getData(); // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI Loading Loading @@ -195,7 +194,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView(), pivot ); findRowPivot<<< 1, reduceGridSizeRound >>>( outMax.getView(), outPos.getView() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } Loading @@ -205,13 +204,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // data stores information to send from each process // recvData stores information that is received #ifdef HAVE_MPI Real *data, *recvData; data = new Real[2]; Real *sendData, *recvData; sendData = new Real[2]; recvData = new Real[2*numOfProcesses]; data[0] = outMax.getElement(0); data[1] = outPos.getElement(0); sendData[0] = outMax.getElement(0); sendData[1] = outPos.getElement(0); MPI_Barrier( MPI_COMM_WORLD ); MPI_Allgather( data, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_Allgather( sendData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), recvData, 2, TNL::Communicators::MPITypeResolver< Real >::getType(), MPI_COMM_WORLD); #endif Loading @@ -235,7 +234,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot if( verbose > 1 && processID == 0 ) printf("%d: max = %.2f, possition = %d, process = %d\n", colPointerMain, Maximum, Possition, ProcessMax ); // All processes has the info in Maximum, ProcesMax and Possition. So deleting arrays. delete []data; delete []sendData; delete []recvData; #endif // Clasic Maximum == 0 then we occured zero pivot so ending this calculation Loading @@ -248,7 +247,7 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // Now when every process has the ProcessMax of pivoting row across all processes // we can send pivoting row to all processes from ProcessMax // mainRow stores pivoting row Array mainRow( this->A.getNumColumns() - colPointerMain + 1 ); //Array mainRow( this->A.getNumColumns() - colPointerMain + 1 ); // If ProcessMax isn't the main process that contains colPointerMain then ProcessMax sets mainRow itself. Loading @@ -258,8 +257,8 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot { if( processID == ProcessMax ) { this->A.getRow( Possition, colPointerMain, mainRow ); mainRow.setElement( mainRow.getSize()-1, this->b.getElement( Possition ) ); this->A.getRowGPU( Possition, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( Possition ) ); } } else { if( colPointerMain/this->A.getNumRows() == processID ){ Loading @@ -279,55 +278,46 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot TNL_CHECK_CUDA_DEVICE; } this->A.getRow( colPointer, colPointerMain, mainRow ); mainRow.setElement( mainRow.getSize()-1, this->b.getElement( colPointer ) ); this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); } } // Broad casting the pivoting row to all processes #ifdef HAVE_MPI MPI_Barrier(MPI_COMM_WORLD); TNL::Communicators::MpiCommunicator::Bcast( mainRow.getData(), mainRow.getSize()-1, ProcessMax, MPI_COMM_WORLD); //if( colPointerMain%100 == 0 ) // saveVec( mainRow, mainRow.getSize(), processID, colPointerMain ); if( verbose > 1 ) { printf( "%d: [", processID); for( int i = 0; i < mainRow.getSize(); i++ ) printf( "%.2f ", mainRow.getElement( i ) ); printf("]\n"); } TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns()+1, ProcessMax, MPI_COMM_WORLD); // Onec more if the ProcessMax filled the mainRow, then the ProcessMax needs to switch this pivoting row with main process. // mainRowSwap is the colPointer of process colPointerMain/this->A.getNumRows() if( ProcessMax != colPointerMain/this->A.getNumRows() ) { Real *mainRowSwap; mainRowSwap = new Real[mainRow.getSize()]; Array mainRowSwapVec( this->A.getNumColumns() - colPointerMain+1 ); Real* mainRowSwap = mainRowSwapVec.getData(); if( processID == ProcessMax ) { TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, mainRow.getSize(), colPointerMain/this->A.getNumRows(), 0 ); this->A.setRow( Possition, colPointerMain, mainRowSwap, mainRow.getSize() ); this->b.setElement( Possition, mainRowSwap[ mainRow.getSize()-1 ] ); TNL::Communicators::MpiCommunicator::Recv( mainRowSwap, this->A.getNumColumns() - colPointerMain+1, colPointerMain/this->A.getNumRows(), 0 ); this->A.setRowGPU( Possition, colPointerMain, mainRowSwap, this->A.getNumColumns() - colPointerMain+1 ); this->b.setElement( Possition, mainRowSwapVec.getElement( mainRowSwapVec.getSize()-1 ) ); } else if( processID == colPointerMain/this->A.getNumRows() ) { this->A.getRow( colPointer, colPointerMain, mainRowSwap, mainRow.getSize() ); mainRowSwap[ mainRow.getSize()-1 ] = this->b.getElement( colPointer ); this->A.getRowGPU( colPointer, colPointerMain, mainRowSwap, this->A.getNumColumns() - colPointerMain+1 ); mainRowSwapVec.setElement( mainRowSwapVec.getSize() - 1, this->b.getElement( colPointer )); TNL::Communicators::MpiCommunicator::Send( mainRowSwap, mainRow.getSize(), ProcessMax, 0 ); this->A.setRow( colPointer, colPointerMain, mainRow ); this->b.setElement( colPointer, mainRow[ mainRow.getSize()-1 ] ); TNL::Communicators::MpiCommunicator::Send( mainRowSwap, mainRowSwapVec.getSize(), ProcessMax, 0 ); this->A.setRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); this->b.setElement( colPointer, mainRowVec.getElement( mainRowSwapVec.getSize()-1 ) ); } delete []mainRowSwap; //delete []mainRowSwap; } #endif // Main kernel works with vector as a main row, so all processes has to set mainRowVec. //TNL::Containers::Vector< Real, TNL::Devices::Host, Index > mainRowVecHost( mainRow, size ); mainRowVec = mainRow; //mainRowVec = mainRow; //delete []mainRow; } else // without pivoting Loading @@ -339,8 +329,9 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( verbose > 3 ){ Loading Loading @@ -438,9 +429,9 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot colPointerMain++; } // delete all used variables cudaFree(pivot); cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; //delete &data; // Calculate real result // (With MPI needs to send info into process 0 as main process with real result, rest processes has result as vector of zeros) Loading
src/Benchmarks/GEM/gem/GEMkernelsMPI.h +1 −5 Original line number Diff line number Diff line Loading @@ -85,16 +85,12 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A, template <typename Real > __global__ void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum, TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot ) TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition ) { int rowPointer = threadIdx.x; Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ]; int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ]; blockReduceArgMax( firstElementInRow, index ); if( threadIdx.x == 0 ) { *positionPivot = index; } } Loading