Commit fb53d7b0 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Changed the mapping of CUDA threads to matrix elements in the GEM kernels (both MPI and non-MPI versions): kernels now use a flat global thread index instead of the per-row `numBlocksOnRow` block mapping, and the block size was reduced from 1024 to 256 threads.

parent 111cd841
Loading
Loading
Loading
Loading
+20 −11
Original line number Diff line number Diff line
@@ -14,8 +14,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
{ 
  Matrix< Real, TNL::Devices::Cuda, Index >* devMat = TNL::Cuda::passToDevice( matrix);
  
  int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumColumns();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
  int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumColumns();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 );
  int numOfBlocks =  matrix.getNumRows() * numBlocksOnColumn;
    
  
@@ -54,10 +54,16 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointer, this->A.getNumColumns() );
     
    if( verbose > 2 ){
      showMatrix<<< 1, 1 >>>( this->A );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    }
    
    if( pivoting == "yes" )
    {
      // PIVOTING
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : 
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 256 ? 256 : 
        TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
      int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
      int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
@@ -79,9 +85,10 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
    }
    
    
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointer) > 256 ?
      256 : this->A.getNumRows() * ( this->A.getNumColumns() - colPointer );
    int gridSize =  TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointer), blockSize );
    //printf( "%d: %d, %d\n", colPointer, blockSize, numOfBlocks );
    
    
    if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer )
@@ -92,15 +99,17 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
         std::cout << "Choosing element at " << *pom << "-th row as pivot with value..."  << std::endl;
         std::cout << "Swapping " << colPointer << "-th and " << *pom <<  "-th rows ... " << std::endl;
      }
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
      int blockSizeSwap = this->A.getNumColumns()-colPointer > 256 ?256 :  this->A.getNumColumns() - colPointer;
      int gridSizeSwap =  TNL::roundUpDivision( this->A.getNumColumns()-colPointer, blockSize );
      
      swapRows<<< gridSizeSwap, blockSizeSwap >>>( devMat, device_vector.getView(), colPointer, pivot );
      cudaDeviceSynchronize();
      TNL_CHECK_CUDA_DEVICE;
    } 
    
    GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, 
    GEMmainKernel<<< gridSize, blockSize >>>( devMat, 
                                                  device_vector.getView(), 
                                                  colPointer, 
                                                  numBlocksOnRow );
                                                  colPointer );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
  }
+16 −10
Original line number Diff line number Diff line
@@ -43,8 +43,8 @@ void calculResultVector( Matrix< Real, TNL::Devices::Cuda, Index >& matrix,
  cudaMemcpy( ( void* ) devMat,( void* ) &matrix, sizeof( Matrix< Real, TNL::Devices::Cuda, Index > ), cudaMemcpyHostToDevice );
  TNL_CHECK_CUDA_DEVICE;
  
  int blockSize = matrix.getNumRows() > 1024 ? 1024 : matrix.getNumRows();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 1024 );
  int blockSize = matrix.getNumRows() > 256 ? 256 : matrix.getNumRows();
  int numBlocksOnColumn = TNL::roundUpDivision( matrix.getNumRows(), 256 );
  int numOfBlocks =  matrix.getNumRows() * numBlocksOnColumn;
  
  
@@ -155,9 +155,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > mainRowVec( this->A.getNumColumns() - colPointerMain + 1 );
    
    // Setting number of threads and blocks for main kernel and for pivoting swapping kernel
    int blockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : this->A.getNumColumns()-colPointer;
    int numBlocksOnRow = TNL::roundUpDivision( (this->A.getNumColumns()-colPointer), 1024 );
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
        
    
    if( pivoting == "yes" )
@@ -176,11 +173,11 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
      outMax.setValue(0); outPos.setValue(-1);
      
      
      // those blocks that have rows to look for pivot in should start looking
      // those blocks that have rows to look for pivot in, should start looking
      if( fromRow != this->A.getNumRows() )
      {
        // setting size for kernel of pivoting
        int reduceBlockSize = (this->A.getNumRows()-fromRow) > 1024 ? 1024 : 
        int reduceBlockSize = (this->A.getNumRows()-fromRow) > 256 ? 256 : 
          TNL::roundToMultiple( this->A.getNumRows()-fromRow, 32 );  
        int reduceGridSize = TNL::roundUpDivision( this->A.getNumRows()-fromRow, reduceBlockSize );
        int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
@@ -272,7 +269,10 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
              std::cout << "Choosing element at " << Possition << "-th row as pivot with value..."  << std::endl;
              std::cout << "Swapping " << colPointer << "-th and " << Possition <<  "-th rows ... " << std::endl;
            }
            swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, Possition );
            int blockSize = this->A.getNumColumns()-colPointerMain > 256 ? 256 : ( this->A.getNumColumns()-colPointerMain );
            int gridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointerMain, blockSize );
            
            swapRows<<< gridSize, blockSize >>>( devMat, device_vector.getView(), colPointerMain, colPointer, Possition );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
          }
@@ -375,6 +375,13 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
    }
    if( verbose > 1 )
      printf( "Elimination: %d/%d\n", colPointerMain, this->A.getNumColumns() );
    
    // Setting number of threads and blocks for main kernel
    int blockSize = this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain) > 256 ?
      256 : this->A.getNumRows() * ( this->A.getNumColumns()-colPointerMain );
    int numOfBlocks =  TNL::roundUpDivision( this->A.getNumRows() * (this->A.getNumColumns()-colPointerMain), blockSize );
    
   
    //std::cout << mainRowVec << std::endl;
    // Finally main kernel calculates the GEM for colPointerMain from mainRowVec
    GEMmainKernel<<< numOfBlocks, blockSize >>>( devMat, 
@@ -382,7 +389,6 @@ bool GEM<Real, Device, Index >::GEMdeviceMPI( Array& x, const TNL::String& pivot
            mainRowVec.getView(),
            colPointer, 
            colPointerMain,
            numBlocksOnRow,
            processID,
            numOfProcesses );
    cudaDeviceSynchronize();
+9 −7
Original line number Diff line number Diff line
@@ -95,19 +95,19 @@ template <typename Real >
__global__ 
void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, 
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
        int colPointerMain, int numBlocksOnRow, int* positionPivot )
        int colPointerMain, int* positionPivot )
{
  if( *positionPivot > colPointerMain )
  {
    int rowPointer1 = colPointerMain;
    int rowPointer2 = *positionPivot;
    int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
    int colPointer = threadIdx.x + blockDim.x *blockIdx.x  + colPointerMain;
    if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
    {
      Real pom = A->getElement( rowPointer1, colPointer );
      A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) );
      A->setElement( rowPointer2, colPointer, pom );
      if( colPointer == colPointerMain && blockIdx.x == 0 )
      if( colPointer == colPointerMain )
      {
        pom = b[rowPointer1];
        b[rowPointer1] = b[rowPointer2];
@@ -122,10 +122,12 @@ template <typename Real >
__global__ 
void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b, 
        int colPointerMain, int numBlocksOnRow )
        int colPointerMain )
{
  int rowPointer = blockIdx.x / numBlocksOnRow;
  int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
  int thread = threadIdx.x + blockIdx.x * blockDim.x;
  int rowPointer = thread / ( A->getNumRows() - colPointerMain );
  int colPointer = thread % ( A->getNumRows() - colPointerMain ) + colPointerMain;
  //printf("%d, %d\n",rowPointer, colPointer );
  if( colPointer > colPointerMain && colPointer < A->getNumColumns() && rowPointer != colPointerMain && rowPointer < A->getNumRows() )
  {
    if( A->getElement( colPointerMain, colPointerMain ) != 0 )
@@ -139,7 +141,7 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
      }
    } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n");
  }
  if( rowPointer != colPointerMain && threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  if( rowPointer < A->getNumRows() && colPointer < A->getNumColumns() && rowPointer != colPointerMain && colPointer == colPointerMain && A->getElement( colPointerMain, colPointerMain ) != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  {
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * b[ colPointerMain ] / A->getElement( colPointerMain, colPointerMain );
  }
+12 −10
Original line number Diff line number Diff line
@@ -95,19 +95,19 @@ template <typename Real >
__global__ 
void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A, 
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
        int colPointerMain, int numBlocksOnRow, int positionPivot )
        int colPointerMain, int colPointerPom, int positionPivot )
{
  if( positionPivot > colPointerMain )
  if( positionPivot > colPointerPom )
  {
    int rowPointer1 = colPointerMain;
    int rowPointer1 = colPointerPom;
    int rowPointer2 = positionPivot;
    int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
    int colPointer = threadIdx.x + blockDim.x * blockIdx.x + colPointerMain;
    if( colPointer < A->getNumColumns() && rowPointer1 < A->getNumRows() )
    {
      Real pom = A->getElement( rowPointer1, colPointer );
      A->setElement( rowPointer1, colPointer, A->getElement( rowPointer2, colPointer ) );
      A->setElement( rowPointer2, colPointer, pom );
      if( colPointer == colPointerMain && blockIdx.x == 0 )
      if( colPointer == colPointerMain )
      {
        pom = b[rowPointer1];
        b[rowPointer1] = b[rowPointer2];
@@ -123,10 +123,12 @@ __global__
void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > mainRow,
        int rowPointerMain, int colPointerMain, int numBlocksOnRow, int processID, int numOfProcesses )
        int rowPointerMain, int colPointerMain, int processID, int numOfProcesses )
{
  int rowPointer = blockIdx.x / numBlocksOnRow;
  int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
  int thread = threadIdx.x + blockDim.x * blockIdx.x;
  int rowPointer = thread / ( A->getNumColumns() - colPointerMain );
  int colPointer = thread % ( A->getNumColumns() - colPointerMain ) + colPointerMain;
  
  if( colPointer > colPointerMain && colPointer < A->getNumColumns() && 
          rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() )
  {
@@ -141,8 +143,8 @@ void GEMmainKernel( Matrix< Real, TNL::Devices::Cuda, int >* A,
      }
    } else if( colPointer == colPointerMain && rowPointer == colPointerMain ) printf( "Error, pivot is zero!\n");
  }
  if( threadIdx.x == 0 && blockIdx.x % numBlocksOnRow == 0 && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() &&
           mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  if( colPointer == colPointerMain && rowPointer + processID * A->getNumRows() != colPointerMain && rowPointer < A->getNumRows() 
          && colPointer < A->getNumColumns() && mainRow[ 0 ] != 0 && A->getElement( rowPointer, colPointerMain ) != 0  )
  {
    b[ rowPointer ] = b[ rowPointer ] - A->getElement( rowPointer, colPointerMain ) * mainRow[ mainRow.getSize()-1 ] / mainRow[ 0 ];
  }
+20 −4
Original line number Diff line number Diff line
#!/bin/bash
device="both"
precision="float"
matice="matice6 662_bus"
matice="662_bus comsol losmoc msc01050"
loops=2

runTest()
@@ -15,10 +14,27 @@ runTest()

	echo ${matrix}.mtx
	./tnl-gem-cuda --input-matrix ${matrix}".mtx" \
			--device ${device} \
			--device GPU \
		        --loops ${loops} \
			--precision ${precision} \
			>> ./results/${matrix}.txt
			>> ./results/${matrix}GPU.txt
	./tnl-gem-cuda --input-matrix ${matrix}".mtx" \
			--device GPU \
		        --loops ${loops} \
			--precision ${precision} \
			--pivoting no \
			>> ./results/${matrix}GPU.txt
	./tnl-gem --input-matrix ${matrix}".mtx" \
			--device CPU \
		        --loops ${loops} \
			--precision ${precision} \
			>> ./results/${matrix}CPU.txt
	./tnl-gem --input-matrix ${matrix}".mtx" \
			--device CPU \
		        --loops ${loops} \
			--precision ${precision} \
			--pivoting no \
			>> ./results/${matrix}CPU.txt
   done
}