Commit 032d4ec0 authored by Illia Kolesnik's avatar Illia Kolesnik Committed by Tomáš Oberhuber
Browse files

Moved changes from matrices-csr to IK/matrices-csr

parent 323b4db2
Loading
Loading
Loading
Loading
+19 −1
Original line number Diff line number Diff line
@@ -231,6 +231,24 @@ public:
   void vectorProductCuda( const InVector& inVector,
                           OutVector& outVector,
                           int gridIdx ) const;
   
   // LightSpMV-style SpMV kernel body: small groups of warp lanes pull rows
   // dynamically from an atomic row counter, balancing load across rows of
   // irregular length. Device-only; invoked from vectorProductCuda with the
   // same template arguments.
   // NOTE(review): gridIdx does not appear to be used by the definition in
   // this commit — confirm whether grid striding is still intended here.
   template< typename InVector,
             typename OutVector,
             int warpSize > 
   __device__
   void spmvCudaLightSpmv( const InVector& inVector,
                            OutVector& outVector,
                            int gridIdx) const;
   
   // CSR-vector SpMV kernel body: one warp per matrix row; lanes stride over
   // the row's nonzeros and combine partial sums with a warp-level reduction.
   // Device-only; invoked from vectorProductCuda with the same template
   // arguments.
   // NOTE(review): gridIdx does not appear to be used by the definition in
   // this commit — rows beyond one grid's thread capacity would be skipped.
   template< typename InVector,
             typename OutVector,
             int warpSize > 
   __device__
   void spmvCudaVector( const InVector& inVector,
                        OutVector& outVector,
                        int gridIdx) const;

   
#endif

   // The following getters allow us to interface TNL with external C-like
+108 −0
Original line number Diff line number Diff line
@@ -671,6 +671,111 @@ Index CSR< Real, Device, Index >::getHybridModeSplit() const
}

#ifdef HAVE_CUDA

template< typename Real,
          typename Device,
          typename Index >
   template< typename InVector,
             typename OutVector,
             int warpSize >
__device__
void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
                                                      OutVector& outVector,
                                                      int gridIdx) const
{
  // LightSpMV-style SpMV: groups of 'elemPerThread' consecutive lanes claim
  // rows dynamically through an atomic counter, so load stays balanced even
  // when row lengths vary widely.
  //
  // NOTE(review): 'rowCnt' is __shared__, i.e. per-block — every launched
  // block walks ALL rows and writes identical results. That is redundant
  // (though not incorrect) work; confirm the intended launch configuration
  // or move the counter to global memory.
  // NOTE(review): 'gridIdx' is unused here, unlike in vectorProductCuda.
  const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
  const IndexType elemPerThread   = 4;                 // lanes cooperating per row
  const IndexType laneID      = index % 32;            // lane within the warp
  const IndexType groupID     = laneID / elemPerThread;
  const IndexType inGroupID   = laneID % elemPerThread;

  // All 32 lanes of the warp take part in every shuffle below, so the
  // participation mask is the full-warp mask. (The previous mask,
  // (unsigned)(warpSize - 1) == 0x1f, named only lanes 0-4.)
  constexpr unsigned fullMask = 0xffffffff;

  IndexType row, minID, column, maxID, idxMtx;
  __shared__ unsigned rowCnt;

  // Fix: initialize the per-block counter in EVERY block. The previous
  // 'index == 0' guard only ran in block 0, leaving rowCnt uninitialized
  // in all other blocks.
  if (threadIdx.x == 0) rowCnt = 0;
  __syncthreads();

  while (true) {

    /* The first lane of each group claims the next unprocessed row. */
    if (inGroupID == 0) row = atomicAdd(&rowCnt, 1);

    /* Broadcast the claimed row from the group's first lane to the group. */
    row = __shfl_sync(fullMask, row, groupID * elemPerThread);

    const bool rowValid = (row < this->rowPointers.getSize() - 1);

    /* Fix: exit only when the WHOLE warp has run out of rows. A per-group
       early return would remove lanes from the full-mask shuffles that the
       remaining lanes still execute — undefined behavior under independent
       thread scheduling. */
    if (__all_sync(fullMask, !rowValid))
      return;

    Real result = 0.0;

    if (rowValid) {
      minID = this->rowPointers[row];
      maxID = this->rowPointers[row + 1];

      /* Strided accumulation: lane k of the group handles elements
         minID+k, minID+k+elemPerThread, ... */
      idxMtx = minID + inGroupID;
      while (idxMtx < maxID) {
        column = this->columnIndexes[idxMtx];
        if (column >= this->getColumns()) {
          // padding entry marks the end of the row's stored elements
          break;
        }
        result += this->values[idxMtx] * inVector[column];
        idxMtx += elemPerThread;
      }
    }

    /* Tree reduction inside each elemPerThread-lane group. Shuffles that
       cross a group boundary only pollute lanes that never write a result;
       lanes on exhausted rows contribute 0 and write nothing. */
    for (IndexType i = elemPerThread / 2; i > 0; i /= 2)
      result += __shfl_down_sync(fullMask, result, i);

    /* The group's first lane now holds the complete row sum. */
    if (rowValid && inGroupID == 0) {
      outVector[row] = result;
    }
  }
}

template< typename Real,
          typename Device,
          typename Index >
   template< typename InVector,
             typename OutVector,
             int warpSize >
__device__
void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector,
                                                   OutVector& outVector,
                                                   int gridIdx) const
{
  // CSR-vector SpMV: one warp per row. Lanes stride over the row's stored
  // elements and combine their partial sums with a warp shuffle reduction.
  // NOTE(review): 'gridIdx' is unused — rows beyond one grid's thread
  // capacity are not processed here; confirm against vectorProductCuda's
  // grid-striding scheme.
  const IndexType index  = blockIdx.x * blockDim.x + threadIdx.x;
  const IndexType laneID = index % warpSize;
  const IndexType row    = index / warpSize; // warpID - one warp per row

  // 'volatile' removed: it is a relic of shared-memory warp-synchronous
  // reductions and has no meaning for a register reduced via *_sync shuffles.
  Real result = 0.0;

  // Every lane of a warp computes the same 'row', so the whole warp takes
  // this branch together and the full-mask shuffles below stay safe.
  if (row >= this->rowPointers.getSize() - 1)
    return;

  const IndexType minID = this->rowPointers[row];
  const IndexType maxID = this->rowPointers[row + 1];

  /* Accumulate this lane's share of the row. */
  for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) {
    const IndexType column = this->columnIndexes[idxMtx];
    if (column >= this->getColumns()) {
      // padding entry marks the end of the row's stored elements
      break;
    }
    result += this->values[idxMtx] * inVector[column];
  }

  /* Warp tree reduction. Fixes: the participation mask must name all lanes
     taking part (0xffffffff), not (warpSize - 1) which covered only lanes
     0-4; and the offsets now follow the warpSize template parameter instead
     of hard-coded 32-lane steps (16, 8, 4, 2, 1). */
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    result += __shfl_down_sync(0xffffffff, result, offset);

  /* Lane 0 holds the complete row sum. */
  if (laneID == 0) {
    outVector[row] = result;
  }
}

template< typename Real,
          typename Device,
          typename Index >
@@ -724,6 +829,9 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                             OutVector& outVector,
                                                             int gridIdx ) const
{
   spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
   return;

   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
+258 −0
Original line number Diff line number Diff line
@@ -1220,6 +1220,264 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
}

// Exercises SpMV on a 20x20 matrix mixing dense, sparse, empty, negative and
// alternating-pattern rows. The input vector is all 2s, so each expected
// output entry is simply twice the corresponding row sum.
template< typename Matrix >
void test_VectorProductLarger()
{
  using RealType = typename Matrix::RealType;
  using DeviceType = typename Matrix::DeviceType;
  using IndexType = typename Matrix::IndexType;
    
/*
 * Sets up the following 20x20 sparse matrix:
 *
 *          0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
 *         ------------------------------------------------------------------------------ 
 *   0   /  1   0   0   0   0   0   0   0   0   0   7   7   7   7   7   7   7   7   7   7 \
 *   1   |  0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
 *   2   |  0   0   3   3   3   0   0   0   0   0   0   0   0   0   0  -2   0   0   0   0 |
 *   3   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
 *   4   |  0  10   0  11   0  12   0   0   0   0   0   0   0   0   0   0   0 -48 -49 -50 |
 *   5   |  4   0   0   0   0   4   0   0   0   0   4   0   0   0   0   4   0   0   0   0 |
 *   6   |  0   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
 *   7   |  0   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -6 |
 *   8   |  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 |
 *   9   |  1   2   3   4   5   6   7   8   9   0   0  -1  -2  -3  -4  -5  -6  -7  -8  -9 |
 *  10   |  7   0   0   0   3   3   3   3   0   0   0   0   0   0   0  -6   0   0   0   0 |
 *  11   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
 *  12   |  0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9   0   9 |
 *  13   |  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |
 *  14   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
 *  15   |  4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0   4   0 |
 *  16   |  0  -2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -2   0 |
 *  17   | -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 |
 *  18   |  0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7   0   7 |
 *  19   \  8   8   8   0   0   8   8   8   0   0   8   8   8   0   0   8   8   8   0   0 /
 */
  const IndexType m_rows = 20;
  const IndexType m_cols = 20;
  
  Matrix m;
  m.reset();
  m.setDimensions( m_rows, m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths;
  rowLengths.setSize( m_rows );
  rowLengths.setValue( 20 );
  m.setCompressedRowLengths( rowLengths );
  
  /* 0 */
  m.setElement( 0, 0, 1 );
  for ( IndexType i = 10; i < m_cols; ++i )
      m.setElement( 0, i, 7 );

  /* 1 */
  m.setElement( 1, 1, 2 );
  
  /* 2 */
  m.setElement( 2, 2, 3 );
  m.setElement( 2, 3, 3 );
  m.setElement( 2, 4, 3 );
  m.setElement( 2, 15, -2 );

  /* 4 */
  // Rows 3 and 13 are deliberately left empty (all zeros).
  m.setElement( 4, 1, 10 );
  m.setElement( 4, 3, 11 );
  m.setElement( 4, 5, 12 );
  m.setElement( 4, 17, -48 );
  m.setElement( 4, 18, -49 );
  m.setElement( 4, 19, -50 );

  /* 5 */
  for ( IndexType i = 0; i < m_cols; i += 5 )
      m.setElement( 5, i, 4 );

  /* 6 */
  m.setElement( 6, 1, 5 );

  /* 7 */
  m.setElement( 7, 2, 6 );
  m.setElement( 7, 19, -6 );

  /* 8 */
  for ( IndexType i = 1; i <= m_cols; ++i )
      m.setElement( 8, i - 1, i );

  /* 9 */
  for ( IndexType i = 1; i <= 9; ++i )
      m.setElement( 9, i - 1, i );

  for ( IndexType i = 11; i <= 19; ++i )
      m.setElement( 9, i, -(i - 10) );

  /* 10 */
  m.setElement( 10, 0, 7 );
  m.setElement( 10, 4, 3 );
  m.setElement( 10, 5, 3 );
  m.setElement( 10, 6, 3 );
  m.setElement( 10, 7, 3 );
  m.setElement( 10, 15, -6 );

  /* 11 && 14 && 17 */
  for (IndexType i = 1; i <= m_cols; ++i) {
      m.setElement(11, i - 1, -i);
      m.setElement(14, i - 1, -i);
      m.setElement(17, i - 1, -i);
  }
  
  /* 12 && 18 */
  for (IndexType i = 0; i < m_cols; ++i) {
      if( i % 2 ) {
          m.setElement(12, i, 9);
          m.setElement(18, i, 7);
      }
  }

  /* 15 */
  for (IndexType i = 0; i < m_cols; ++i)
    if (i % 2 == 0) 
      m.setElement(15, i, 4);

  /* 16 */
  m.setElement(16, 1, -2);
  m.setElement(16, 18, -2);

  /* 19 */
  for (IndexType i = 0; i < m_cols; i += 5 ) {
      m.setElement(19, i, 8);
      m.setElement(19, i + 1, 8);
      m.setElement(19, i + 2, 8);
  }

  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
  
  // Multiply by the all-2s vector; each result entry is 2 * (row sum).
  VectorType inVector;
  inVector.setSize( 20 );
  for( IndexType i = 0; i < inVector.getSize(); i++ )        
      inVector.setElement( i, 2 );

  VectorType outVector;  
  outVector.setSize( 20 );
  for( IndexType j = 0; j < outVector.getSize(); j++ )
      outVector.setElement( j, 0 );

  m.vectorProduct( inVector, outVector);
  
  // Expected values are 2 * (row sum) of the matrix diagrammed above,
  // e.g. row 0: 2 * (1 + 10*7) = 142; empty rows give 0.
  EXPECT_EQ( outVector.getElement( 0 ), 142 );
  EXPECT_EQ( outVector.getElement( 1 ), 4 );
  EXPECT_EQ( outVector.getElement( 2 ), 14 );
  EXPECT_EQ( outVector.getElement( 3 ), 0 );
  EXPECT_EQ( outVector.getElement( 4 ), -228 );
  EXPECT_EQ( outVector.getElement( 5 ), 32 );
  EXPECT_EQ( outVector.getElement( 6 ), 10 );
  EXPECT_EQ( outVector.getElement( 7 ), 0 );
  EXPECT_EQ( outVector.getElement( 8 ), 420 );
  EXPECT_EQ( outVector.getElement( 9 ), 0 );
  EXPECT_EQ( outVector.getElement( 10 ), 26 );
  EXPECT_EQ( outVector.getElement( 11 ), -420 );
  EXPECT_EQ( outVector.getElement( 12 ), 180 );
  EXPECT_EQ( outVector.getElement( 13 ), 0 );
  EXPECT_EQ( outVector.getElement( 14 ), -420 );
  EXPECT_EQ( outVector.getElement( 15 ), 80 );
  EXPECT_EQ( outVector.getElement( 16 ), -8 );
  EXPECT_EQ( outVector.getElement( 17 ), -420 );
  EXPECT_EQ( outVector.getElement( 18 ), 140 );
  EXPECT_EQ( outVector.getElement( 19 ), 192 );
}

// Exercises SpMV on a larger NxN matrix whose pattern repeats every 10 rows.
// The input vector is all 2s, so each expected entry is twice the row sum.
template< typename Matrix >
void test_VectorProductGiant()
{
  using RealType = typename Matrix::RealType;
  using DeviceType = typename Matrix::DeviceType;
  using IndexType = typename Matrix::IndexType;
  const IndexType N = 200; // Should be a multiple of 10
/*
 * Sets up the following NxN sparse matrix (diagram fixed to match the code
 * below: the nonzeros of rows 0, 2 and the +3s of row 4 sit at ODD columns):
 *
 *    /  0   1   0   1 ... 0  1 \   row % 10 == 0: 1 at every odd column
 *    |  0   0   0   0 ... 0  0 |
 *    |  0   2   0   2 ... 0  2 |   row % 10 == 2: 2 at every odd column
 *    |  0   0   0   0 ... 0  0 |
 *    | -3   3  -3   3 ...-3  3 |   row % 10 == 4: 3 at odd, -3 at even
 *    |  0   0   0   0 ... 0  0 |
 *    |  5   5   5   5 ... 5  5 |   row % 10 == 6: dense row of 5s
 *    |  0   0   0   0 ... 0  0 |
 *    |  0   0   0   0 ... 0  0 |
 *    |  2   0   0   0 ... 0  0 |   row % 10 == 9: the only 2 in the row
 *
 *       ..................
 */
    
  const IndexType m_rows = N;
  const IndexType m_cols = N;
  
  Matrix m;
  m.reset();
  m.setDimensions( m_rows, m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths;
  rowLengths.setSize( m_rows );
  rowLengths.setValue( m_cols );
  m.setCompressedRowLengths( rowLengths );
  
  for ( IndexType row = 0; row < m_rows; ++row ) {
      if (row % 10 == 0) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2)
                  m.setElement( row, col, 1 );
          }
      }
      else if (row % 10 == 2) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2 == 1)
                  m.setElement( row, col, 2 );
          }
      }
      else if (row % 10 == 4) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2)
                  m.setElement( row, col, 3 );
              else
                  m.setElement( row, col, -3 );
          }
      }
      else if (row % 10 == 6) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              m.setElement( row, col, 5 );
          }
      }
      else if (row % 10 == 9) {
          m.setElement( row, 0, 2 );
      }
  }

  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
  
  VectorType inVector;
  inVector.setSize( N );
  for( IndexType i = 0; i < inVector.getSize(); i++ )        
      inVector.setElement( i, 2 );

  VectorType outVector;  
  outVector.setSize( N );
  for( IndexType j = 0; j < outVector.getSize(); j++ )
      outVector.setElement( j, 0 );

  
  m.vectorProduct( inVector, outVector);
  
  // Fix: expectations are expressed in terms of N (instead of hard-coded
  // 200 / 400 / 2000) so the test stays valid when N is changed, as the
  // comment on N promises.
  for ( IndexType i = 0; i < outVector.getSize(); ++i ) {
      if ( i % 10 == 0 )
          EXPECT_EQ( outVector.getElement( i ), N );      // (N/2) ones * 2
      else if ( i % 10 == 2 )
          EXPECT_EQ( outVector.getElement( i ), 2 * N );  // (N/2) twos * 2
      else if ( i % 10 == 6 )
          EXPECT_EQ( outVector.getElement( i ), 10 * N ); // N fives * 2
      else if ( i % 10 == 9 )
          EXPECT_EQ( outVector.getElement( i ), 4 );      // single 2 * 2
      else 
          EXPECT_EQ( outVector.getElement( i ), 0 );      // empty rows, and
                                                          // row%10==4 sums to 0
  }
}

template< typename Matrix >
void test_RowsReduction()
{
+14 −0
Original line number Diff line number Diff line
@@ -112,6 +112,20 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest )
    test_VectorProduct< CSRMatrixType >();
}

// Registers the 20x20 mixed-pattern SpMV test for every CSR matrix type in
// the fixture's type list.
TYPED_TEST( CSRMatrixTest, vectorProductLargerTest )
{
    using CSRMatrixType = typename TestFixture::CSRMatrixType;

    test_VectorProductLarger< CSRMatrixType >();
}

// Registers the NxN repeating-pattern SpMV test for every CSR matrix type in
// the fixture's type list.
TYPED_TEST( CSRMatrixTest, vectorProductGiantTest )
{
    using CSRMatrixType = typename TestFixture::CSRMatrixType;

    test_VectorProductGiant< CSRMatrixType >();
}

TYPED_TEST( CSRMatrixTest, saveAndLoadTest )
{
    using CSRMatrixType = typename TestFixture::CSRMatrixType;