Commit a03b09c1 authored by Matouš Fencl's avatar Matouš Fencl Committed by Jakub Klinkovský
Browse files

Parallel reduction repaired, optimalized for pivoting GPU->CPU copy. Text: seq GEM, paral GEM

parent fd737b29
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ template < typename Real,
        typename Index >
void Matrix< Real, Device, Index >::setDimensions( const Index rows, const Index columns )
{ 
  TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!");
  TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "Matrix cannot have zero rows nor columns!\n");
  this->rows = rows; 
  this->columns = columns;
  
+3 −3
Original line number Diff line number Diff line
@@ -112,11 +112,11 @@ void readMatrixVector( Matrix< Real, Device, Index>& matrix,
{
  typedef Matrix< Real, Devices::Host, Index> MatrixHost;
  MatrixHost matrixHost;
  // Get matrix
  Matrices::MatrixReader< MatrixHost > m;
  m.readMtxFile( "./test-matrices/" + matrixName, matrixHost );
  if( verbose > 1 )
    cout << "reading matrix " << matrixName << endl;
  // Get matrix
  Matrices::MatrixReader< MatrixHost > m;
  m.readMtxFile( "./test-matrices/" + matrixName, matrixHost, verbose );
  if( verbose > 2 )
    matrixHost.showMatrix();
  matrix = matrixHost;
+1 −1
Original line number Diff line number Diff line
@@ -144,7 +144,7 @@ bool GEM< Real, Device, Index >::solveWithPivoting( Array& x, int verbose )
      if( verbose > 1 )
      {
         std::cout << std::endl;
         std::cout << "Choosing element at " << pivotPosition << "-th row as pivot..." << std::endl;
         std::cout << "Choosing element at " << pivotPosition << "-th row as pivot with value " << this->A.getElement(k,k)<<"..."  << std::endl;
         std::cout << "Swapping " << k << "-th and " << pivotPosition <<  "-th rows ... " << std::endl;
      }
            
+7 −6
Original line number Diff line number Diff line
@@ -57,14 +57,15 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
    if( pivoting == "yes" )
    {
      // PIVOTING
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 512 ? 512 : 
      int reduceBlockSize = (this->A.getNumColumns()-colPointer) > 1024 ? 1024 : 
        TNL::roundToMultiple( this->A.getNumColumns()-colPointer, 32 );  
      int reduceGridSize = TNL::roundUpDivision( this->A.getNumColumns()-colPointer, reduceBlockSize );
      int reduceGridSizeRound = TNL::roundToMultiple( reduceGridSize, 32 );
      //printf("%d, %d, %d\n", reduceBlockSize, reduceGridSize, reduceGridSizeRound );

      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSizeRound);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSizeRound);
      outMax.setValue(0); outPos.setValue(0);
      TNL::Containers::Vector< Real, TNL::Devices::Cuda, Index > outMax(reduceGridSize);
      TNL::Containers::Vector< Index, TNL::Devices::Cuda, Index > outPos(reduceGridSize);
      //outMax.setValue(0); outPos.setValue(0);

      findPivot<<< reduceGridSize, reduceBlockSize >>>( devMat, colPointer, outMax.getView(), outPos.getView() );
      cudaDeviceSynchronize();
@@ -83,12 +84,12 @@ bool GEM<Real, Device, Index >::GEMdevice( Array& x, const TNL::String& pivoting
    int numOfBlocks =  this->A.getNumRows() * numBlocksOnRow;
    
    
    if( pivoting == "yes" && *pom != -1 && *pom != colPointer )
    if( pivoting == "yes" )// && *pom != -1 && *pom != colPointer )
    {
      if( verbose > 1 )
      {
         std::cout << std::endl;
         std::cout << "Choosing element at " << *pom << "-th row as pivot..." << std::endl;
         std::cout << "Choosing element at " << *pom << "-th row as pivot with value..."  << std::endl;
         std::cout << "Swapping " << colPointer << "-th and " << *pom <<  "-th rows ... " << std::endl;
      }
      swapRows<<< numBlocksOnRow, blockSize >>>( devMat, device_vector.getView(), colPointer, numBlocksOnRow, pivot );
+25 −54
Original line number Diff line number Diff line
@@ -25,18 +25,24 @@ __inline__ __device__ void warpReduceArgMax(Real& val, int& index) {
template <typename Real >
__inline__ __device__ void blockReduceArgMax(Real& val, int& index) 
{
  static __shared__ Real shared[32]; // Shared mem for 32 partial sums
  static __shared__ Real sharedVal[32]; // Shared mem for 32 partial reduction
  static __shared__ Real sharedIndex[32]; // Shared mem for 32 partial reduction
  int lane = threadIdx.x % 32;
  int wid = threadIdx.x / 32;

  warpReduceArgMax( val, index);     // Each warp performs partial reduction

  if (lane==0) shared[wid]=val; // Write reduced value to shared memory
  if (lane==0) 
  {
    sharedVal[wid]=val; // Write reduced value to shared memory
    sharedIndex[wid]=index; // Write reduced value to shared memory
  }

  __syncthreads();              // Wait for all partial reductions

  //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  val = (threadIdx.x < blockDim.x / 32) ? sharedVal[lane] : 0;
  index = (threadIdx.x < blockDim.x / 32) ? sharedIndex[lane] : 0;

  if (wid==0)
  {
@@ -63,23 +69,9 @@ void findPivot( Matrix< Real, TNL::Devices::Cuda, int >* A,
  blockReduceArgMax( firstElementInRow, index );
  if( threadIdx.x == 0 )
  {
    //printf("%d: %.2f\n", index, firstElementInRow );
    outMaximum[blockIdx.x] = firstElementInRow;
    outPosition[blockIdx.x] = index;
  }
    
  
  //if( rowPointer < A->getNumRows() && rowPointer >= colPointerMain )
  {
    //int pom = __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) );
    /*if( TNL::abs( A->getElement(rowPointer, colPointerMain ) ) > 1 )
      printf("%d: %d, %.4f\n", rowPointer, pom, TNL::abs( A->getElement(rowPointer, colPointerMain ) ) );*/
    //atomicMax( Maximum, pom );
  } 
  
  
  //if( threadIdx.x == 0 && blockIdx.x == 0 )
  //    printf("%.4f %d\n", firstElementInRow, index );
}


@@ -88,36 +80,14 @@ __global__
void findRowPivot( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > outMaximum,
        TNL::Containers::VectorView< int, TNL::Devices::Cuda, int > outPosition, int* positionPivot )
{
  /*if( threadIdx.x == 0 )
  {
    printf("outMax = [ ");
    for( int i = 0; i < blockDim.x; i++ )
      printf("%.2f, ", outMaximum[i] );
    printf("]\n");
    
    printf("outPos = [ ");
    for( int i = 0; i < blockDim.x; i++ )
      printf("%d, ", outPosition[i] );
    printf("]\n");
  }*/
  
  int rowPointer = threadIdx.x;
  Real firstElementInRow = rowPointer >= blockDim.x ? 0 : outMaximum[ rowPointer ];
  int index = outPosition[ rowPointer ];
  Real firstElementInRow = rowPointer >= outMaximum.getSize() ? 0 : outMaximum[ rowPointer ];
  int index = rowPointer >= outPosition.getSize() ? 0 : outPosition[ rowPointer ];
  blockReduceArgMax( firstElementInRow, index );
  //printf("%d: %.2f\n", index, firstElementInRow );
  if( threadIdx.x == 0 )
  {
    *positionPivot = index;
  }
  
  
  /*int rowPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
  if( rowPointer >= colPointerMain && rowPointer < A->getNumRows() &&
          __float_as_int( (float)TNL::abs( A->getElement( rowPointer, colPointerMain ) ) ) == *Maximum )
  {
    atomicExch( positionPivot, rowPointer);
  }*/
}


@@ -127,8 +97,8 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
        TNL::Containers::VectorView< Real, TNL::Devices::Cuda, int > b,
        int colPointerMain, int numBlocksOnRow, int* positionPivot )
{
  //if( threadIdx.x == 0 && blockIdx.x == 0 )
  //  printf("\nChoosing element at %d-th row as pivot...\nSwapping %d-th and %d-th rows...\n",*positionPivot,colPointerMain,*positionPivot );
  if( *positionPivot > colPointerMain )
  {
    int rowPointer1 = colPointerMain;
    int rowPointer2 = *positionPivot;
    int colPointer = threadIdx.x + blockDim.x * (blockIdx.x % numBlocksOnRow) + colPointerMain;
@@ -145,6 +115,7 @@ void swapRows( Matrix< Real, TNL::Devices::Cuda, int >* A,
      }
    }
  }
}


template <typename Real >