Commit 9e0ea2ce authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'JK/expression-templates' into 'develop'

Expression templates for nested vectors

Closes #60, #61, and #78

See merge request !68
parents 1fdc65ec bacbfb84
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -127,6 +127,16 @@ if( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-self-assign-overloaded" )
endif()

# enable address sanitizer (does not work with MPI due to many false positives, does not work with nvcc at all)
if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
   if( NOT ${WITH_MPI} AND NOT ${WITH_CUDA} )
      set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" )
      set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "${CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
      set( CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
      set( CMAKE_SHARED_LINKER_FLAGS_DEBUG "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
   endif()
endif()

# enable link time optimizations (but not in continuous integration)
if( NOT DEFINED ENV{CI_JOB_NAME} )
   if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" )
+5 −18
Original line number Diff line number Diff line
@@ -4,25 +4,14 @@
#include <pybind11/operators.h>
namespace py = pybind11;

// including pybind11/numpy.h is needed for the specializations of py::format_descriptor
// for enum types, see https://github.com/pybind/pybind11/issues/2135
#include <pybind11/numpy.h>

#include "../tnl_indexing.h"

#include <TNL/Containers/Array.h>


// pybind11 should actually take care of this inside py::format_descriptor, but apparently it does not work...
// see https://github.com/pybind/pybind11/issues/2135
template< typename T, typename = void >
struct underlying_type
{
   using type = T;
};
template< typename T >
struct underlying_type< T, std::enable_if_t< std::is_enum< T >::value > >
{
   using type = std::underlying_type_t< T >;
};


template< typename ArrayType >
void export_Array(py::module & m, const char* name)
{
@@ -64,9 +53,7 @@ void export_Array(py::module & m, const char* name)
                // Size of one scalar
                sizeof( typename ArrayType::ValueType ),
                // Python struct-style format descriptor
                py::format_descriptor<
                     typename underlying_type< typename ArrayType::ValueType >::type
                >::format(),
                py::format_descriptor< typename ArrayType::ValueType >::format(),
                // Number of dimensions
                1,
                // Buffer dimensions
+71 −55
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@
#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/SharedMemory.h>
#include <TNL/Algorithms/CudaReductionBuffer.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
@@ -40,6 +39,25 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#endif

/*
 * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
 * std::plus<void>::operator() function to be constexpr and hence __host__ __device__
 * (for example, when the arguments are StaticVector<3, double> etc). Hence, we use
 * this wrapper which triggers only a warning and not an error as is the case when
 * the reduction functor is called from a __global__ or __device__ function. Let's
 * hope it works otherwise...
 */
template< typename Reduction, typename Arg1, typename Arg2 >
__host__ __device__
auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 )
{
// let's suppress the aforementioned warning...
#pragma push
#pragma diag_suppress 2979
   return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) );
#pragma pop
}

template< int blockSize,
          typename Result,
          typename DataFetcher,
@@ -54,7 +72,11 @@ CudaReductionKernel( const Result zero,
                     const Index end,
                     Result* output )
{
   Result* sdata = Cuda::getSharedMemory< Result >();
   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
   __shared__ Result sdata[shmemElements];

   // Get the thread id (tid), global thread id (gid) and gridSize.
   const Index tid = threadIdx.x;
@@ -65,19 +87,19 @@ CudaReductionKernel( const Result zero,

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSize < end ) {
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < end ) {
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
      gid += 2 * gridSize;
   }
   while( gid < end ) {
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
   }
   __syncthreads();
@@ -85,48 +107,48 @@ CudaReductionKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSize >= 1024 ) {
      if( tid < 512 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 ) {
      if( tid < 256 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 ) {
      if( tid < 128 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSize >= 128 ) {
      if( tid <  64 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( tid < 32 ) {
      if( blockSize >= 64 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSize >= 32 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] );
      __syncwarp();
      if( blockSize >= 16 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] );
      __syncwarp();
      if( blockSize >=  8 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] );
      __syncwarp();
      if( blockSize >=  4 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] );
      __syncwarp();
      if( blockSize >=  2 )
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] );
   }

   // Store the result back in the global memory.
@@ -150,8 +172,12 @@ CudaReductionWithArgumentKernel( const Result zero,
                                 Index* idxOutput,
                                 const Index* idxInput = nullptr )
{
   Result* sdata = Cuda::getSharedMemory< Result >();
   Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
   __shared__ Result sdata[shmemElements];
   __shared__ Index sidx[shmemElements];

   // Get the thread id (tid), global thread id (gid) and gridSize.
   const Index tid = threadIdx.x;
@@ -409,12 +435,6 @@ struct CudaReductionKernelLauncher
         blockSize.x = Reduction_maxThreadsPerBlock;
         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

         // when there is only one warp per blockSize.x, we need to allocate two warps
         // worth of shared memory so that we don't index shared memory out of bounds
         const Index shmem = (blockSize.x <= 32)
                  ? 2 * blockSize.x * sizeof( Result )
                  : blockSize.x * sizeof( Result );

         // This is "general", but this method always sets blockSize.x to a specific value,
         // so runtime switch is not necessary - it only prolongs the compilation time.
/*
@@ -423,55 +443,55 @@ struct CudaReductionKernelLauncher
         {
            case 512:
               CudaReductionKernel< 512 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case 256:
               cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel< 256 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case 128:
               cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel< 128 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  64:
               cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  64 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  32:
               cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  32 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  16:
               cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  16 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
           case   8:
               cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   8 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   4:
               cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   4 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   2:
               cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   2 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   1:
               TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -486,8 +506,9 @@ struct CudaReductionKernelLauncher
         if( blockSize.x == Reduction_maxThreadsPerBlock ) {
            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

            // shared memory is allocated statically inside the kernel
            CudaReductionKernel< Reduction_maxThreadsPerBlock >
            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output);
            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output);
            cudaStreamSynchronize(0);
            TNL_CHECK_CUDA_DEVICE;
         }
@@ -519,12 +540,6 @@ struct CudaReductionKernelLauncher
         blockSize.x = Reduction_maxThreadsPerBlock;
         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

         // when there is only one warp per blockSize.x, we need to allocate two warps
         // worth of shared memory so that we don't index shared memory out of bounds
         const Index shmem = (blockSize.x <= 32)
                  ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
                  : blockSize.x * ( sizeof( Result ) + sizeof( Index ) );

         // This is "general", but this method always sets blockSize.x to a specific value,
         // so runtime switch is not necessary - it only prolongs the compilation time.
/*
@@ -533,55 +548,55 @@ struct CudaReductionKernelLauncher
         {
            case 512:
               CudaReductionWithArgumentKernel< 512 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case 256:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel< 256 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case 128:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel< 128 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  64:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  64 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  32:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  32 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  16:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  16 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
           case   8:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   8 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   4:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   4 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   2:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   2 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   1:
               TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -596,8 +611,9 @@ struct CudaReductionKernelLauncher
         if( blockSize.x == Reduction_maxThreadsPerBlock ) {
            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

            // shared memory is allocated statically inside the kernel
            CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
            cudaStreamSynchronize(0);
            TNL_CHECK_CUDA_DEVICE;
         }
+12 −0
Original line number Diff line number Diff line
@@ -25,6 +25,8 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d
   using ResultType = std::decay_t< decltype( expression[0] ) >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< ResultType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's result type" );
   ResultType result = std::numeric_limits< ResultType >::max();
   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
      const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
@@ -42,6 +44,8 @@ auto DistributedExpressionArgMin( const Expression& expression )
   using ResultType = std::pair< RealType, IndexType >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< RealType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's real type" );
   ResultType result( -1, std::numeric_limits< RealType >::max() );
   const auto group = expression.getCommunicationGroup();
   if( group != CommunicatorType::NullGroup ) {
@@ -82,6 +86,8 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d
   using ResultType = std::decay_t< decltype( expression[0] ) >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< ResultType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's result type" );
   ResultType result = std::numeric_limits< ResultType >::lowest();
   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
      const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
@@ -99,6 +105,8 @@ auto DistributedExpressionArgMax( const Expression& expression )
   using ResultType = std::pair< RealType, IndexType >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< RealType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's real type" );
   ResultType result( -1, std::numeric_limits< RealType >::lowest() );
   const auto group = expression.getCommunicationGroup();
   if( group != CommunicatorType::NullGroup ) {
@@ -168,6 +176,8 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec
   using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< ResultType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's result type" );
   ResultType result = std::numeric_limits< ResultType >::max();
   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
      const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
@@ -196,6 +206,8 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca
   using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
   using CommunicatorType = typename Expression::CommunicatorType;

   static_assert( std::numeric_limits< ResultType >::is_specialized,
                  "std::numeric_limits is not specialized for the reduction's result type" );
   ResultType result = std::numeric_limits< ResultType >::max();
   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
      const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );
+1 −1
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@ template< typename T, typename V = T >
constexpr ExpressionVariableType
getExpressionVariableType()
{
   if( std::is_arithmetic< std::decay_t< T > >::value )
   if( std::is_arithmetic< T >::value )
      return ArithmeticVariable;
   // vectors must be considered as an arithmetic type when used as RealType in another vector
   if( IsArithmeticSubtype< T, V >::value )
Loading