Commit f620b0ee authored by Illia Kolesnik's avatar Illia Kolesnik Committed by Tomáš Oberhuber
Browse files

Added CSR Adaptive and CSR Stream

parent f9ace650
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -230,7 +230,7 @@ public:
   __device__
   void vectorProductCuda( const InVector& inVector,
                           OutVector& outVector,
                           int gridIdx, unsigned *blocks, size_t size ) const;
                           int gridIdx, int *blocks, size_t size ) const;
   
   template< typename InVector,
             typename OutVector,
@@ -247,7 +247,7 @@ public:
   void spmvCSRAdaptive( const InVector& inVector,
                           OutVector& outVector,
                           int gridIdx,
                           unsigned *blocks,
                           int *blocks,
                           size_t blocks_size) const;
#endif

+161 −116
Original line number Diff line number Diff line
@@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
   }
}

/* template< typename Real,
          typename Device,
          typename Index,
          typename InVector,
          int warpSize >
__global__
void spmvCSRVectorHelper( const InVector& inVector,
                          Real *out,
                          size_t from,
                          size_t to,
                          size_t perWarp)
{
   const size_t index  = blockIdx.x * blockDim.x + threadIdx.x;
   const size_t warpID = index / warpSize;
   const size_t laneID = index % warpSize;
   const size_t minID  = from + warpID * perWarp;
   size_t maxID  = from + (warpID + 1) * perWarp;
   if (minID >= to)  return;
   if (maxID >= to ) maxID = to;
   
   Real result = 0.0;
   for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
      const IndexType column = this->columnIndexes[i];
      if (column >= this->getColumns())
            continue;
      result += this->values[i] * inVector[column];
   }
   atomicAdd(out, result);
} */

template< typename Real,
          typename Device,
@@ -743,59 +772,58 @@ __device__
void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
                                                      OutVector& outVector,
                                                      int gridIdx,
                                                      unsigned *blocks,
                                                      int *blocks,
                                                      size_t blocks_size) const
{
   constexpr IndexType SHARED = 1024; /* TODO: delete */
   /* Configuration ---------------------------------------------------*/
   constexpr size_t SHARED = 49152/sizeof(float);
   constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
   constexpr size_t MAX_PER_WARP = 65536;
   constexpr size_t ELEMENTS_PER_WARP = 1024;
   constexpr size_t THREADS_PER_BLOCK = 1024;
   constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
   //--------------------------------------------------------------------
   const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
   __shared__ unsigned blockCnt;
   __shared__ float shared_res[SHARED];
   float result = 0.0;
   if (index == 0) blockCnt = 0;  // Init shared variable
   __syncthreads();
   
   const IndexType laneID = index % warpSize;
   IndexType blockIdx;

   while (true) {
      /* Get row number */
      if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1);
      /* Propagate row number in warp */
      blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0);
   IndexType blockIdx = index / warpSize;
   __shared__ float shared_res[SHARED];
   Real result = 0.0;
   if (blockIdx >= blocks_size - 1)
      return;
      /* Number of elements */
   const IndexType minRow = blocks[blockIdx];
   const IndexType maxRow = blocks[blockIdx + 1];
   const IndexType minID = this->rowPointers[minRow];
   const IndexType maxID = this->rowPointers[maxRow];
   const IndexType elements = maxID - minID;
      result = 0.0;
   /* rows per block more than 1 */
   if ((maxRow - minRow) > 1) {
      /////////////////////////////////////* CSR STREAM *//////////////
      /* Copy and calculate elements from global to shared memory, coalesced */
      const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
      for (IndexType i = laneID; i < elements; i += warpSize) {
         const IndexType elementIdx = i + minID;
         const IndexType column = this->columnIndexes[elementIdx];
         if (column >= this->getColumns())
               break;
            shared_res[i] = this->values[elementIdx] * inVector[column];
            continue;
         shared_res[i + offset] = this->values[elementIdx] * inVector[column];
      }

      const IndexType row = minRow + laneID;
         if (row >= maxRow - 1)
            continue;
      if (row >= maxRow)
         return;
      /* Calculate result */
      const IndexType to = this->rowPointers[row + 1] - minID;
      for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
            result += shared_res[i];
         result += shared_res[i + offset];
      }
      outVector[row] = result; // Write result
      } else {
   } else if (elements <= MAX_PER_WARP) {
      /////////////////////////////////////* CSR VECTOR *//////////////
      for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
            const IndexType column = this->columnIndexes[i];
         IndexType column = this->columnIndexes[i];
         if (column >= this->getColumns())
            break;

         result += this->values[i] * inVector[column];
      }
      /* Reduction */
@@ -805,7 +833,18 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
      result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
      if (laneID == 0) outVector[minRow] = result; // Write result
      }
   } else {
      /////////////////////////////////////* CSR VECTOR LONG *//////////////
      const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
      const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
      const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
      // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>(
      //             inVector,
      //             &outVector[minRow],
      //             (size_t)(minID + ELEMENTS_PER_WARP),
      //             (size_t)maxID,
      //             (size_t)ELEMENTS_PER_WARP
      // );
   }
}

@@ -862,7 +901,7 @@ __device__
void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                             OutVector& outVector,
                                                             int gridIdx,
                                                             unsigned *blocks, size_t size ) const
                                                             int *blocks, size_t size ) const
{
   spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
   return;
@@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
                                                     const InVector* inVector,
                                                     OutVector* outVector,
                                                     int gridIdx,
                                                     unsigned *blocks, size_t size)
                                                     int *blocks, size_t size)
{
   typedef CSR< Real, Devices::Cuda, Index > Matrix;
   static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
@@ -965,7 +1004,7 @@ template< typename Real,
void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
                                    const InVector& inVector,
                                    OutVector& outVector,
                                    unsigned *blocks,
                                    int *blocks,
                                    size_t size )
{
#ifdef HAVE_CUDA
@@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
   Matrix* kernel_this = Cuda::passToDevice( matrix );
   InVector* kernel_inVector = Cuda::passToDevice( inVector );
   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
   unsigned *kernelBlocks = Cuda::passToDevice( *blocks );
   int *kernelBlocks;
   cudaMalloc((void **)&kernelBlocks, sizeof(int) * size);
   cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);

   TNL_CHECK_CUDA_DEVICE;
   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
@@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
      const int sharedMemory = cudaBlockSize.x * sizeof( Real );
      if( matrix.getCudaWarpSize() == 32 )
      // const int threads = cudaBlockSize.x;
      if( matrix.getCudaWarpSize() == 32 ) {
         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      if( matrix.getCudaWarpSize() == 16 )
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      if( matrix.getCudaWarpSize() == 8 )
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      if( matrix.getCudaWarpSize() == 4 )
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      if( matrix.getCudaWarpSize() == 2 )
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      if( matrix.getCudaWarpSize() == 1 )
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                            <<< 2, 1024 >>>
                                            ( kernel_this,
                                              kernel_inVector,
                                              kernel_outVector,
                                              gridIdx, kernelBlocks, size );
      }
      // if( matrix.getCudaWarpSize() == 16 )
      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      //                                       ( kernel_this,
      //                                         kernel_inVector,
      //                                         kernel_outVector,
      //                                         gridIdx, kernelBlocks, size );
      // if( matrix.getCudaWarpSize() == 8 )
      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      //                                       ( kernel_this,
      //                                         kernel_inVector,
      //                                         kernel_outVector,
      //                                         gridIdx, kernelBlocks, size );
      // if( matrix.getCudaWarpSize() == 4 )
      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      //                                       ( kernel_this,
      //                                         kernel_inVector,
      //                                         kernel_outVector,
      //                                         gridIdx, kernelBlocks, size );
      // if( matrix.getCudaWarpSize() == 2 )
      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      //                                       ( kernel_this,
      //                                         kernel_inVector,
      //                                         kernel_outVector,
      //                                         gridIdx, kernelBlocks, size );
      // if( matrix.getCudaWarpSize() == 1 )
      //    CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
      //                                       <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      //                                       ( kernel_this,
      //                                         kernel_inVector,
      //                                         kernel_outVector,
      //                                         gridIdx, kernelBlocks, size );

   }
   TNL_CHECK_CUDA_DEVICE;
@@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
                                                              inVector.getData(),
                                                              outVector.getData() );
#else
         std::vector<unsigned> inBlock;
         constexpr int SHARED = 49152/sizeof(float);
         constexpr int SHARED_PER_WARP = SHARED / 32;
         std::vector<int> inBlock;
         inBlock.push_back(0);
         unsigned sum = 0;
         for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) {
            printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i));
         }
         for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
            size_t elements = matrix.getRowPointers().getElement(i + 1) -
                                 matrix.getRowPointers().getElement(i);
            if (elements > 1024) {
               if (sum == 0) {
         size_t sum = 0;
         size_t i;
         int prev_i = 0;
         for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
            size_t elements = matrix.getRowPointers().getElement(i) -
                                 matrix.getRowPointers().getElement(i - 1);
            sum += elements;
            if (sum > SHARED_PER_WARP) {
               if (i - prev_i == 1) {
                  inBlock.push_back(i);
               }
               else {
               } else {
                  inBlock.push_back(i - 1);
                  inBlock.push_back(i);
                  --i;
               }
               sum = 0;
            }
            sum += elements;
            
            if (sum <= 1024)
               prev_i = i;
               continue;
            else {
               inBlock.push_back(i - 1);
               sum = elements;
            }
            if (i - prev_i == 32) {
               inBlock.push_back(i);
               prev_i = i;
               sum = 0;
            }
         }
         inBlock.push_back(matrix.getRowPointers().getSize() - 1);
+66 −74
Original line number Diff line number Diff line
@@ -1259,9 +1259,11 @@ void test_VectorProductLarger()
  Matrix m;
  m.reset();
  m.setDimensions( m_rows, m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths;
  rowLengths.setSize( m_rows );
  rowLengths.setValue( 20 );
  typename Matrix::CompressedRowLengthsVector rowLengths(
     {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12}
  );
//   rowLengths.setSize( m_rows );
//   rowLengths.setValue( 20 );
  m.setCompressedRowLengths( rowLengths );
  
  /* 0 */
@@ -1389,93 +1391,83 @@ void test_VectorProductGiant()
  using RealType = typename Matrix::RealType;
  using DeviceType = typename Matrix::DeviceType;
  using IndexType = typename Matrix::IndexType;
  const IndexType N = 200; // Should be a multiple of 10
/*
 * Sets up the following NxN sparse matrix:
 *
 *    /  1   0   1   0 ... 1  0 \
 *    |  0   0   0   0 ... 0  0 |
 *    |  0   2   0   2 ... 0  2 |
 *    |  0   0   0   0 ... 0  0 |
 *    |  3  -3   3  -3 ... 3 -3 |
 *    |  0   0   0   0 ... 0  0 |
 *    |  5   5   5   5 ... 5  5 |
 *    |  0   0   0   0 ... 0  0 |
 *    |  0   0   0   0 ... 0  0 |
 *    |  2   0   0   0 ... 0  0 | <- the only 2 in the row
 *    
 *       ..................
 */
    
  const IndexType m_rows = N;
  const IndexType m_cols = N;
  IndexType m_rows = 100;
  IndexType m_cols = 100;
  
  Matrix m;
  m.reset();
  m.setDimensions( m_rows, m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths;
  rowLengths.setSize( m_rows );
  rowLengths.setValue( m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths(
     {
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
        100, 100, 100, 100, 100, 100, 100, 100, 100, 100
     }
  );

  m.setCompressedRowLengths( rowLengths );
  
  for ( IndexType row = 0; row < m_rows; ++row ) {
      if (row % 10 == 0) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2)
                  m.setElement( row, col, 1 );
          }
      }
      else if (row % 10 == 2) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2 == 1)
                  m.setElement( row, col, 2 );
          }
      }
      else if (row % 10 == 4) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              if (col % 2)
                  m.setElement( row, col, 3 );
              else
                  m.setElement( row, col, -3 );
          }
      }
      else if (row % 10 == 6) {
          for ( IndexType col = 0; col < m_cols; ++col ) {
              m.setElement( row, col, 5 );
          }
      }
      else if (row % 10 == 9) {
          m.setElement( row, 0, 2 );
      }
  }
  for (int i = 0; i < m_rows; ++i)
     for (int j = 0; j < m_cols; ++j) 
         m.setElement( i, j, i + 1 );

  using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
  
  VectorType inVector;
  inVector.setSize( N );
  for( IndexType i = 0; i < inVector.getSize(); i++ )        
      inVector.setElement( i, 2 );
  inVector.setSize( m_rows );
  for( IndexType i = 0; i < inVector.getSize(); ++i )        
      inVector.setElement( i, 1 );

  VectorType outVector;  
  outVector.setSize( N );
  for( IndexType j = 0; j < outVector.getSize(); j++ )
      outVector.setElement( j, 0 );

  outVector.setSize( m_rows );
  for( IndexType i = 0; i < outVector.getSize(); ++i )
      outVector.setElement( i, 0 );

  m.vectorProduct( inVector, outVector);

  for ( IndexType i = 0; i < outVector.getSize(); ++i ) {
      if ( i % 10 == 0 )
          EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ );
      else if ( i % 10 == 2 )
          EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ );
      else if ( i % 10 == 6 )
          EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ );
      else if ( i % 10 == 9 )
          EXPECT_EQ( outVector.getElement( i ), 4 );
      else 
          EXPECT_EQ( outVector.getElement( i ), 0 );
  for (int i = 0; i < m_rows; ++i)
   EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );

   //-----------------------------------------------------

  m_rows = 2;
  m_cols = 1000;
  
  m.reset();
  m.setDimensions( m_rows, m_cols );
  typename Matrix::CompressedRowLengthsVector rowLengths2(
     {
        1000, 1000
     }
  );

  m.setCompressedRowLengths( rowLengths2 );
  
  for (int i = 0; i < m_rows; ++i)
     for (int j = 0; j < m_cols; ++j) 
         m.setElement( i, j, i + 1 );

  VectorType inVector2;
  inVector2.setSize( m_cols );
  for( IndexType i = 0; i < inVector2.getSize(); i++ )
      inVector2.setElement( i, 1 );

  VectorType outVector2;  
  outVector2.setSize( m_rows );
  for( IndexType i = 0; i < outVector2.getSize(); ++i )
      outVector2.setElement( i, 0 );
  m.vectorProduct( inVector2, outVector2);

  for (int i = 0; i < m_rows; ++i)
   EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 );
}

template< typename Matrix >