Commit 7dcf1383 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Added MatrixOperations::geam to optimize BiCGstab(l)

parent 3540ffe0
Loading
Loading
Loading
Loading
+165 −0
Original line number Diff line number Diff line
@@ -146,6 +146,86 @@ public:
      }

   }

   /*
    * This function performs the matrix-matrix addition
    *    C = alpha * A + beta * B
    * where:
    *    alpha and beta are scalars,
    *    A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda,
    *    lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C,
    *    respectively.
    *
    * It is assumed that n is much smaller than m.
    */
   template< typename RealType,
             typename IndexType >
   static void
   geam( const IndexType& m,
         const IndexType& n,
         const RealType& alpha,
         const RealType* A,
         const IndexType& lda,
         const RealType& beta,
         const RealType* B,
         const IndexType& ldb,
         RealType* C,
         const IndexType& ldc )
   {
      TNL_ASSERT_GT( m, 0, "m must be positive" );
      TNL_ASSERT_GT( n, 0, "n must be positive" );
      TNL_ASSERT_GE( lda, m, "lda must be at least m" );
      TNL_ASSERT_GE( ldb, m, "lda must be at least m" );
      TNL_ASSERT_GE( ldc, m, "lda must be at least m" );

      if( n == 1 ) {
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
         #endif
         for( IndexType j = 0; j < m; j++ )
            C[ j ] = alpha * A[ j ] + beta * B[ j ];
      }
      else {
         // all matrices should be accessed column-wise so we split the work into small
         // blocks and each block process by columns, either parallelly or serially
         constexpr int block_size = 128;
         const int blocks = m / block_size;

         #ifdef HAVE_OPENMP
         #pragma omp parallel if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 )
         #endif
         {
            #ifdef HAVE_OPENMP
            #pragma omp for nowait
            #endif
            for( int b = 0; b < blocks; b++ ) {
               const int block_offset = b * block_size;
               for( IndexType j = 0; j < n; j++ ) {
                  const IndexType offset_A = j * lda + block_offset;
                  const IndexType offset_B = j * ldb + block_offset;
                  const IndexType offset_C = j * ldc + block_offset;
                  for( int i = 0; i < block_size; i++ )
                     C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ];
               }
            }

            // the first thread that reaches here processes the last, incomplete block
            #ifdef HAVE_OPENMP
            #pragma omp single nowait
            #endif
            {
               for( IndexType j = 0; j < n; j++ ) {
                  const IndexType offset_A = j * lda;
                  const IndexType offset_B = j * ldb;
                  const IndexType offset_C = j * ldc;
                  for( IndexType i = blocks * block_size; i < m; i++ )
                     C[ offset_C + i ] = alpha * A[ offset_A + i ] + beta * B[ offset_B + i ];
               }
            }
         }
      }

   }
};


@@ -199,6 +279,34 @@ GemvCudaKernel( const IndexType m,
      }
   }
}

template< typename RealType,
          typename IndexType >
__global__ void
GeamCudaKernel( const IndexType m,
                const IndexType n,
                const RealType alpha,
                const RealType* A,
                const IndexType lda,
                const RealType beta,
                const RealType* B,
                const IndexType ldb,
                RealType* C,
                const IndexType ldc )
{
   IndexType x = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSizeX = blockDim.x * gridDim.x;
   const IndexType y = blockIdx.y * blockDim.y + threadIdx.y;
   const IndexType offset_A = y * lda;
   const IndexType offset_B = y * ldb;
   const IndexType offset_C = y * ldc;

   if( y < n )
      while( x < m ) {
         C[ x + offset_C ] = alpha * A[ x + offset_A ] + beta * B[ x + offset_B ];
         x += gridSizeX;
      }
}
#endif

// specialization for CUDA
@@ -253,6 +361,63 @@ public:
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }

   /*
    * This function performs the matrix-matrix addition
    *    C = alpha * A + beta * B
    * where:
    *    alpha and beta are scalars,
    *    A, B, C are (m by n) matrices stored in column-major format on Devices::Cuda,
    *    lda, ldb, ldc (all >= m) are the leading dimensions of matrices A, B, C,
    *    respectively.
    *
    * It is assumed that n is much smaller than m.
    */
   template< typename RealType,
             typename IndexType >
   static void
   geam( const IndexType& m,
         const IndexType& n,
         const RealType& alpha,
         const RealType* A,
         const IndexType& lda,
         const RealType& beta,
         const RealType* B,
         const IndexType& ldb,
         RealType* C,
         const IndexType& ldc )
   {
      TNL_ASSERT_GT( m, 0, "m must be positive" );
      TNL_ASSERT_GT( n, 0, "n must be positive" );
      TNL_ASSERT_GE( lda, m, "lda must be at least m" );
      TNL_ASSERT_GE( ldb, m, "lda must be at least m" );
      TNL_ASSERT_GE( ldc, m, "lda must be at least m" );

#ifdef HAVE_CUDA
      dim3 blockSize, gridSize;

      // max 16 columns of threads
      blockSize.y = min( n, 16 );
      // max 256 threads per block, power of 2
      blockSize.x = 256;
      while( blockSize.x * blockSize.y > 256 )
         blockSize.x /= 2;

      const IndexType desGridSize = Gemv_minBlocksPerMultiprocessor
                                  * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) );
      gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );

      GeamCudaKernel<<< gridSize, blockSize >>>(
            m, n,
            alpha, A, lda,
            beta, B, ldb,
            C, ldc );
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
};
+16 −16
Original line number Diff line number Diff line
@@ -151,14 +151,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x )
         beta = alpha * rho_1 / rho_0;
         rho_0 = rho_1;

         for( int i = 0; i <= j; i++ ) {
            u.bind( &U.getData()[ i * ldSize ], size );
            r_i.bind( &R.getData()[ i * ldSize ], size );
         /****
             * u_i := r_i - beta * u_i
          * U_[0:j] := R_[0:j] - beta * U_[0:j]
          */
            u.addVector( r_i, 1.0, -beta );
         }
         MatrixOperations< DeviceType >::
            geam( size, j + 1,
                  1.0, R.getData(), ldSize,
                  -beta, U.getData(), ldSize,
                  U.getData(), ldSize );

         /****
          * u_{j+1} = A u_j
@@ -170,14 +170,14 @@ BICGStabL< Matrix, Preconditioner >::solve( const Vector& b, Vector& x )
         gamma = r_ast.scalarProduct( Au );
         alpha = rho_0 / gamma;

         for( int i = 0; i <= j; i++ ) {
            r_i.bind( &R.getData()[ i * ldSize ], size );
            u.bind( &U.getData()[ (i + 1) * ldSize ], size );
         /****
             * r_i := r_i - alpha * u_{i+1}
          * R_[0:j] := R_[0:j] - alpha * U_[1:j+1]
          */
            r_i.addVector( u, -alpha );
         }
         MatrixOperations< DeviceType >::
            geam( size, j + 1,
                  1.0, R.getData(), ldSize,
                  -alpha, U.getData() + ldSize, ldSize,
                  R.getData(), ldSize );

         /****
          * r_{j+1} = A r_j