Commit 1c128e27 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Parallelization of the CWYGMRES solver using OpenMP

parent 3a0d6bdc
Loading
Loading
Loading
Loading
+36 −14
Original line number Diff line number Diff line
#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>

namespace TNL {
namespace Containers {
namespace Algorithms {   

template< typename Device >
class Multireduction
{
};

template<>
class Multireduction< Devices::Cuda >
{
public:
   template< typename Operation >
bool multireductionOnCudaDevice( const Operation& operation,
   static bool
   reduce( Operation& operation,
           int n,
           const typename Operation::IndexType size,
           const typename Operation::RealType* deviceInput1,
           const typename Operation::IndexType ldInput1,
           const typename Operation::RealType* deviceInput2,
           typename Operation::ResultType* hostResult );
};

template<>
class Multireduction< Devices::Host >
{
public:
   template< typename Operation >
bool multireductionOnHostDevice( const Operation& operation,
   static bool
   reduce( Operation& operation,
           int n,
           const typename Operation::IndexType size,
           const typename Operation::RealType* deviceInput1,
           const typename Operation::IndexType ldInput1,
           const typename Operation::RealType* deviceInput2,
           typename Operation::ResultType* hostResult );
};

} // namespace Algorithms
} // namespace Containers
+95 −22
Original line number Diff line number Diff line
#pragma once

#include "Multireduction.h"

//#define CUDA_REDUCTION_PROFILING

#include <TNL/Assert.h>
@@ -34,7 +36,9 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024
 *    hostResult: output array of size = n
 */
template< typename Operation >
bool multireductionOnCudaDevice( Operation& operation,
bool
Multireduction< Devices::Cuda >::
reduce( Operation& operation,
        int n,
        const typename Operation::IndexType size,
        const typename Operation::RealType* deviceInput1,
@@ -44,6 +48,7 @@ bool multireductionOnCudaDevice( Operation& operation,
{
#ifdef HAVE_CUDA
   Assert( n > 0, );
   Assert( size <= ldInput1, );

   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
@@ -62,10 +67,10 @@ bool multireductionOnCudaDevice( Operation& operation,
         RealType hostArray2[ Multireduction_minGpuDataSize ];
         if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) )
            return false;
         return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
         return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
      }
      else {
         return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, nullptr, hostResult );
         return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, nullptr, hostResult );
      }
   }

@@ -119,7 +124,7 @@ bool multireductionOnCudaDevice( Operation& operation,
    * Reduce the data on the host system.
    */
   LaterReductionOperation laterReductionOperation;
   multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult );
   Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult );

   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
@@ -144,7 +149,9 @@ bool multireductionOnCudaDevice( Operation& operation,
 *    hostResult: output array of size = n
 */
template< typename Operation >
bool multireductionOnHostDevice( Operation& operation,
bool
Multireduction< Devices::Host >::
reduce( Operation& operation,
        int n,
        const typename Operation::IndexType size,
        const typename Operation::RealType* input1,
@@ -152,15 +159,81 @@ bool multireductionOnHostDevice( Operation& operation,
        const typename Operation::RealType* input2,
        typename Operation::ResultType* result )
{
   Assert( n > 0, );
   Assert( size <= ldInput1, );

   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef typename Operation::ResultType ResultType;

   const int block_size = 128;
   const int blocks = size / block_size;

#ifdef HAVE_OPENMP
   if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 )
#pragma omp parallel
   {
      // first thread initializes the result array
      #pragma omp single nowait
      {
         for( int k = 0; k < n; k++ )
            result[ k ] = operation.initialValue();
      }

      // initialize array for thread-local results
      ResultType r[ n ];
      for( int k = 0; k < n; k++ )
         r[ k ] = operation.initialValue();

      #pragma omp for nowait
      for( int b = 0; b < blocks; b++ ) {
         const int offset = b * block_size;
         for( int k = 0; k < n; k++ ) {
            const RealType* _input1 = input1 + k * ldInput1;
            for( IndexType i = 0; i < block_size; i++ )
               r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 );
         }
      }

      // the first thread that reaches here processes the last, incomplete block
      #pragma omp single nowait
      {
         for( int k = 0; k < n; k++ ) {
            const RealType* _input1 = input1 + k * ldInput1;
            for( IndexType i = blocks * block_size; i < size; i++ )
               r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 );
         }
      }

      // inter-thread reduction of local results
      for( int k = 0; k < n; k++ )
         #pragma omp critical
         {
            operation.commonReductionOnDevice( result[ k ], r[ k ] );
         }
   }
   else {
#endif
      for( int k = 0; k < n; k++ )
         result[ k ] = operation.initialValue();

      for( int b = 0; b < blocks; b++ ) {
         const int offset = b * block_size;
         for( int k = 0; k < n; k++ ) {
            const RealType* _input1 = input1 + k * ldInput1;
            for( IndexType i = 0; i < block_size; i++ )
               result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 );
         }
      }

      for( int k = 0; k < n; k++ ) {
         const RealType* _input1 = input1 + k * ldInput1;
      for( IndexType i = 0; i < size; i++ )
         for( IndexType i = blocks * block_size; i < size; i++ )
            result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 );
      }
#ifdef HAVE_OPENMP
   }
#endif

   return true;
}
+6 −0
Original line number Diff line number Diff line
@@ -40,6 +40,9 @@ public:
      Assert( m <= lda, );

      if( beta != 0.0 ) {
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
#endif
         for( IndexType j = 0; j < m; j++ ) {
            RealType tmp = 0.0;
            for( int k = 0; k < n; k++ )
@@ -49,6 +52,9 @@ public:
      }
      else {
         // the vector y might be uninitialized, and 0.0 * NaN = NaN
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
#endif
         for( IndexType j = 0; j < m; j++ ) {
            RealType tmp = 0.0;
            for( int k = 0; k < n; k++ )
+24 −42
Original line number Diff line number Diff line
@@ -388,17 +388,9 @@ hauseholder_generate( DeviceVector& Y,
   if( i > 0 ) {
      // aux = Y_{i-1}^T * y_i
      RealType aux[ i ];
      if( std::is_same< DeviceType, Devices::Host >::value ) {
         // this is optimized better than multireductionOnHostDevice
         DeviceVector y_k;
         for( int k = 0; k < i; k++ ) {
            y_k.bind( &Y.getData()[ k * ldSize ], size );
            aux[ k ] = y_k.scalarProduct( y_i );
         }
      }
      if( std::is_same< DeviceType, Devices::Cuda >::value ) {
      Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct;
         if( ! multireductionOnCudaDevice( scalarProduct,
      if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
               ( scalarProduct,
                 i,
                 size,
                 Y.getData(),
@@ -406,10 +398,9 @@ hauseholder_generate( DeviceVector& Y,
                 y_i.getData(),
                 aux ) )
      {
            std::cerr << "multireductionOnCudaDevice failed" << std::endl;
         std::cerr << "multireduction failed" << std::endl;
         throw 1;
      }
      }

      // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
      for( int k = 0; k < i; k++ ) {
@@ -503,17 +494,9 @@ hauseholder_cwy_transposed( DeviceVector& z,
{
   // aux = Y_i^T * w
   RealType aux[ i + 1 ];
   if( std::is_same< DeviceType, Devices::Host >::value ) {
      // this is optimized better than multireductionOnHostDevice
      DeviceVector y_k;
      for( int k = 0; k <= i; k++ ) {
         y_k.bind( &Y.getData()[ k * ldSize ], size );
         aux[ k ] = y_k.scalarProduct( w );
      }
   }
   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
   Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct;
      if( ! multireductionOnCudaDevice( scalarProduct,
   if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
            ( scalarProduct,
              i + 1,
              size,
              Y.getData(),
@@ -521,10 +504,9 @@ hauseholder_cwy_transposed( DeviceVector& z,
              w.getData(),
              aux ) )
   {
         std::cerr << "multireductionOnCudaDevice failed" << std::endl;
      std::cerr << "multireduction failed" << std::endl;
      throw 1;
   }
   }

   // aux = T_i^T * aux
   // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place