Loading src/Benchmarks/GEM/Matrix/Matrix.h +10 −1 Original line number Diff line number Diff line Loading @@ -70,9 +70,18 @@ class Matrix */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host only for mainRow alocated on CPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRowGPU( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only. * called from host only for mainRow alocated on GPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. Loading src/Benchmarks/GEM/Matrix/Matrix_impl.h +42 −3 Original line number Diff line number Diff line Loading @@ -128,6 +128,32 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = size-1 > 256 ? 256: size-1; int gridSize = TNL::roundToMultiple( size-1, blockSize ); fillArray<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; } #endif } template < typename Real, typename Device, typename Index > Loading Loading @@ -188,13 +214,13 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRo #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < mainRow.getSize()-1; i++ ) for( int i = 0; i < this->getNumColumns()-col; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < mainRow.getSize()-1; i++ ) for( int i = 0; i < this->getNumColumns()-col; i++ ) mainRow.setElement(i, this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ) ); } #endif Loading Loading @@ -293,3 +319,16 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix #endif // HAVE_CUDA return *this; } #ifdef HAVE_CUDA template < typename Real, typename Index > __global__ void fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col ) { int thread = threadIdx.x + blockIdx.x * blockDim.x; if( thread < size ) { data[ thread ] = A->getElement( row, col + thread ); } } #endif No newline at end of file src/Benchmarks/GEM/gem/GEMdeviceMPI.h +17 −22 Original line number Diff line number Diff line Loading @@ -143,7 +143,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows) Index colPointerMain = 0; double duration[ this->A.getNumColumns() ]; // Bcast and main row vector and clasic array Real* data; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 ); data = mainRowVec.getData(); // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI Loading @@ -154,7 +157,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 ); // Setting number of threads and blocks for main kernel and for pivoting swapping kernel Loading Loading @@ -330,45 +332,43 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot } else // without pivoting { std::clock_t start; start = std::clock(); #ifdef HAVE_MPI //if( processID == 0 ) //printf( "Initializing mainRow!\n"); Real* data; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRowVec ); mainRowVec.setElement( mainRowVec.getSize()-1, this->b.getElement( colPointer ) ); this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); data = mainRowVec.getData(); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); } else { cudaMalloc( &data, mainRowVec.getSize() * sizeof(Real) ); if( verbose > 3 ){ showData<<<1,1>>>(data,mainRowVec.getSize(),processID ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } //printf( "brodcasting mainRow!\n"); TNL::Communicators::MpiCommunicator::Bcast( data, mainRowVec.getSize(), colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns() + 1, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); mainRowVec.bind( data, this->A.getNumColumns() - colPointerMain + 1 ); //mainRowVec.bind( data, this->A.getNumColumns() + 1 ); /*if( verbose > 2 ) if( verbose > 2 ) { for( int i = 0; i < numOfProcesses; i++ ) if( i == processID ){ std::cout << mainRowVec << std::endl; } MPI_Barrier(MPI_COMM_WORLD); }*/ } #else this->A.getRow(colPointer, colPointerMain, mainRowVec ); mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) ); #endif duration[ colPointerMain ] = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; } //printf("computing \n"); if( verbose > 1 ) { #ifdef HAVE_MPI Loading Loading @@ -437,11 +437,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // increment colPointerMain for next while passage colPointerMain++; } double time; for( int i = 0; i < this->A.getNumColumns(); i++ ) time += duration[ i ]; time = time/this->A.getNumColumns(); printf("%d: copy MPI part: %.8f\n", processID, time ); // delete all used variables cudaFree(pivot); cudaFree( devMat ); Loading src/Benchmarks/GEM/gem/GEMkernelsMPI.h +1 −1 Original line number Diff line number Diff line Loading @@ -153,7 +153,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ]; b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ A->getNumColumns() - colPointerMain ] / mainRow[ 0 ]; } } Loading Loading
src/Benchmarks/GEM/Matrix/Matrix.h +10 −1 Original line number Diff line number Diff line Loading @@ -70,9 +70,18 @@ class Matrix */ void getRow( Index row, Index col, Real* mainRow, Index size ); /** * Returns ROW on row and starting column. Can be * called from host only for mainRow alocated on CPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. */ void getRowGPU( Index row, Index col, Real* mainRow, Index size ); /** * Sets ROW on row and starting column into matrix A. Can be * called from host only. * called from host only for mainRow alocated on GPU. * * @param row and column, mainRow is array with size to be filled with values. * @return void. Loading
src/Benchmarks/GEM/Matrix/Matrix_impl.h +42 −3 Original line number Diff line number Diff line Loading @@ -128,6 +128,32 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Real* mainRow, #endif } template < typename Real, typename Device, typename Index > void Matrix< Real, Device, Index >::getRowGPU( Index row, Index col, Real* mainRow, Index size ) { TNL_ASSERT( row > -1 && col > -1, std::cerr << "Matrix cannot have egative row nor negative column!"); TNL_ASSERT( row < rows && col < columns, std::cerr << "Matrix dosn't have that much rows or columns!"); #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { Matrix< Real, TNL::Devices::Cuda, Index >* devMat; cudaMalloc( ( void** ) &devMat, ( size_t ) sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ) ); cudaMemcpy( ( void* ) devMat,( void* ) this, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice ); TNL_CHECK_CUDA_DEVICE; int blockSize = size-1 > 256 ? 256: size-1; int gridSize = TNL::roundToMultiple( size-1, blockSize ); fillArray<<< gridSize, blockSize >>>( devMat, mainRow, size-1, row, col ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; cudaFree( devMat ); TNL_CHECK_CUDA_DEVICE; } #endif } template < typename Real, typename Device, typename Index > Loading Loading @@ -188,13 +214,13 @@ void Matrix< Real, Device, Index >::getRow( Index row, Index col, Vector& mainRo #if DEBUG printf("On CPU\n"); #endif for( int i = 0; i < mainRow.getSize()-1; i++ ) for( int i = 0; i < this->getNumColumns()-col; i++ ) mainRow[ i ] = this->getElement( row, col + i ); } #ifdef HAVE_CUDA if( std::is_same< Device, TNL::Devices::Cuda >::value ) { for( int i = 0; i < mainRow.getSize()-1; i++ ) for( int i = 0; i < this->getNumColumns()-col; i++ ) mainRow.setElement(i, this->data.getElement( row*TNL::roundToMultiple( this->columns, TNL::Cuda::getWarpSize() ) + col + i ) ); } #endif Loading Loading @@ -293,3 +319,16 @@ Matrix< Real, Device, Index >::operator=( Matrix< Real, Device2, Index>& matrix #endif // HAVE_CUDA return *this; } #ifdef HAVE_CUDA template < typename Real, typename Index > __global__ void fillArray( Matrix< Real, TNL::Devices::Cuda, int >* A, Real* data, Index size, Index row, Index col ) { int thread = threadIdx.x + blockIdx.x * blockDim.x; if( thread < size ) { data[ thread ] = A->getElement( row, col + thread ); } } #endif No newline at end of file
src/Benchmarks/GEM/gem/GEMdeviceMPI.h +17 −22 Original line number Diff line number Diff line Loading @@ -143,7 +143,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // Main pointer to row, over all parts of matrices, colPointerMain in (0 - number of rows) Index colPointerMain = 0; double duration[ this->A.getNumColumns() ]; // Bcast and main row vector and clasic array Real* data; TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() + 1 ); data = mainRowVec.getData(); // Main cycle for all rows across all MPI parts, vector x is the only one with full size on MPI, or use A.getNumColumns() for rectangular matrices. while( colPointerMain < x.getSize() ){ #ifdef HAVE_MPI Loading @@ -154,7 +157,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // main row vector for computation (pivoting, non-pivoting) TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 ); // Setting number of threads and blocks for main kernel and for pivoting swapping kernel Loading Loading @@ -330,45 +332,43 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot } else // without pivoting { std::clock_t start; start = std::clock(); #ifdef HAVE_MPI //if( processID == 0 ) //printf( "Initializing mainRow!\n"); Real* data; if( colPointerMain/this->A.getNumRows() == processID ){ this->A.getRow( colPointer, colPointerMain, mainRowVec ); mainRowVec.setElement( mainRowVec.getSize()-1, this->b.getElement( colPointer ) ); this->A.getRowGPU( colPointer, colPointerMain, data, this->A.getNumColumns() - colPointerMain+1 ); data = mainRowVec.getData(); mainRowVec.setElement( this->A.getNumColumns() - colPointerMain, this->b.getElement( colPointer ) ); } else { cudaMalloc( &data, mainRowVec.getSize() * sizeof(Real) ); if( verbose > 3 ){ showData<<<1,1>>>(data,mainRowVec.getSize(),processID ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } //printf( "brodcasting mainRow!\n"); TNL::Communicators::MpiCommunicator::Bcast( data, mainRowVec.getSize(), colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); TNL::Communicators::MpiCommunicator::Bcast( data, this->A.getNumColumns() + 1, colPointerMain/this->A.getNumRows(), MPI_COMM_WORLD); mainRowVec.bind( data, this->A.getNumColumns() - colPointerMain + 1 ); //mainRowVec.bind( data, this->A.getNumColumns() + 1 ); /*if( verbose > 2 ) if( verbose > 2 ) { for( int i = 0; i < numOfProcesses; i++ ) if( i == processID ){ std::cout << mainRowVec << std::endl; } MPI_Barrier(MPI_COMM_WORLD); }*/ } #else this->A.getRow(colPointer, colPointerMain, mainRowVec ); mainRowVec.setElement( mainRowVec.getSize() - 1, this->b.getElement( colPointer ) ); #endif duration[ colPointerMain ] = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; } //printf("computing \n"); if( verbose > 1 ) { #ifdef HAVE_MPI Loading Loading @@ -437,11 +437,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot // increment colPointerMain for next while passage colPointerMain++; } double time; for( int i = 0; i < this->A.getNumColumns(); i++ ) time += duration[ i ]; time = time/this->A.getNumColumns(); printf("%d: copy MPI part: %.8f\n", processID, time ); // delete all used variables cudaFree(pivot); cudaFree( devMat ); Loading
src/Benchmarks/GEM/gem/GEMkernelsMPI.h +1 −1 Original line number Diff line number Diff line Loading @@ -153,7 +153,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A, if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0 ) { b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ]; b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ A->getNumColumns() - colPointerMain ] / mainRow[ 0 ]; } } Loading