Merge branch 'JK/expression-templates' into 'develop' (9e0ea2ce) · Commits · TNL / tnl-dev

CMakeLists.txt

+10 −0

Original line number	Diff line number	Diff line
		@@ -127,6 +127,16 @@ if( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
		set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-self-assign-overloaded" )
		endif()

		# enable address sanitizer (does not work with MPI due to many false positives, does not work with nvcc at all)
		if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
		if( NOT ${WITH_MPI} AND NOT ${WITH_CUDA} )
		set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" )
		set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "${CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
		set( CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
		set( CMAKE_SHARED_LINKER_FLAGS_DEBUG "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" )
		endif()
		endif()

		# enable link time optimizations (but not in continuous integration)
		if( NOT DEFINED ENV{CI_JOB_NAME} )
		if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" )

src/Python/pytnl/tnl/Array.h

+5 −18

Original line number	Diff line number	Diff line
		@@ -4,25 +4,14 @@
		#include <pybind11/operators.h>
		namespace py = pybind11;

		// including pybind11/numpy.h is needed for the specializations of py::format_descriptor
		// for enum types, see https://github.com/pybind/pybind11/issues/2135
		#include <pybind11/numpy.h>

		#include "../tnl_indexing.h"

		#include <TNL/Containers/Array.h>


		// pybind11 should actually take care of this inside py::format_descriptor, but apparently it does not work...
		// see https://github.com/pybind/pybind11/issues/2135
		template< typename T, typename = void >
		struct underlying_type
		{
		using type = T;
		};
		template< typename T >
		struct underlying_type< T, std::enable_if_t< std::is_enum< T >::value > >
		{
		using type = std::underlying_type_t< T >;
		};


		template< typename ArrayType >
		void export_Array(py::module & m, const char* name)
		{
		@@ -64,9 +53,7 @@ void export_Array(py::module & m, const char* name)
		// Size of one scalar
		sizeof( typename ArrayType::ValueType ),
		// Python struct-style format descriptor
		py::format_descriptor<
		typename underlying_type< typename ArrayType::ValueType >::type
		>::format(),
		py::format_descriptor< typename ArrayType::ValueType >::format(),
		// Number of dimensions
		1,
		// Buffer dimensions

src/TNL/Algorithms/CudaReductionKernel.h

+71 −55

Original line number	Diff line number	Diff line
		@@ -15,7 +15,6 @@
		#include <TNL/Assert.h>
		#include <TNL/Math.h>
		#include <TNL/Cuda/DeviceInfo.h>
		#include <TNL/Cuda/SharedMemory.h>
		#include <TNL/Algorithms/CudaReductionBuffer.h>
		#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
		#include <TNL/Exceptions/CudaSupportMissing.h>
		@@ -40,6 +39,25 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
		static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#endif

		/*
		* nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
		* std::plus<void>::operator() function to be constexpr and hence __host__ __device__
		* (for example, when the arguments are StaticVector<3, double> etc). Hence, we use
		* this wrapper which triggers only a warning and not an error as is the case when
		* the reduction functor is called from a __global__ or __device__ function. Let's
		* hope it works otherwise...
		*/
		template< typename Reduction, typename Arg1, typename Arg2 >
		__host__ __device__
		auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 )
		{
		// let's suppress the aforementioned warning...
		#pragma push
		#pragma diag_suppress 2979
		return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) );
		#pragma pop
		}

		template< int blockSize,
		typename Result,
		typename DataFetcher,
		@@ -54,7 +72,11 @@ CudaReductionKernel( const Result zero,
		const Index end,
		Result* output )
		{
		Result* sdata = Cuda::getSharedMemory< Result >();
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
		__shared__ Result sdata[shmemElements];

		// Get the thread id (tid), global thread id (gid) and gridSize.
		const Index tid = threadIdx.x;
		@@ -65,19 +87,19 @@ CudaReductionKernel( const Result zero,

		// Start with the sequential reduction and push the result into the shared memory.
		while( gid + 4 * gridSize < end ) {
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < end ) {
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
		gid += 2 * gridSize;
		}
		while( gid < end ) {
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		gid += gridSize;
		}
		__syncthreads();
		@@ -85,48 +107,48 @@ CudaReductionKernel( const Result zero,
		// Perform the parallel reduction.
		if( blockSize >= 1024 ) {
		if( tid < 512 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 ) {
		if( tid < 256 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 ) {
		if( tid < 128 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] );
		__syncthreads();
		}
		if( blockSize >= 128 ) {
		if( tid < 64 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] );
		__syncthreads();
		}

		// This runs in one warp so we use __syncwarp() instead of __syncthreads().
		if( tid < 32 ) {
		if( blockSize >= 64 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] );
		__syncwarp();
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSize >= 32 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] );
		__syncwarp();
		if( blockSize >= 16 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] );
		__syncwarp();
		if( blockSize >= 8 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] );
		__syncwarp();
		if( blockSize >= 4 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] );
		__syncwarp();
		if( blockSize >= 2 )
		sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] );
		}

		// Store the result back in the global memory.
		@@ -150,8 +172,12 @@ CudaReductionWithArgumentKernel( const Result zero,
		Index* idxOutput,
		const Index* idxInput = nullptr )
		{
		Result* sdata = Cuda::getSharedMemory< Result >();
		Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
		__shared__ Result sdata[shmemElements];
		__shared__ Index sidx[shmemElements];

		// Get the thread id (tid), global thread id (gid) and gridSize.
		const Index tid = threadIdx.x;
		@@ -409,12 +435,6 @@ struct CudaReductionKernelLauncher
		blockSize.x = Reduction_maxThreadsPerBlock;
		gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const Index shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * sizeof( Result )
		: blockSize.x * sizeof( Result );

		// This is "general", but this method always sets blockSize.x to a specific value,
		// so runtime switch is not necessary - it only prolongs the compilation time.
		/*
		@@ -423,55 +443,55 @@ struct CudaReductionKernelLauncher
		{
		case 512:
		CudaReductionKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 1:
		TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
		@@ -486,8 +506,9 @@ struct CudaReductionKernelLauncher
		if( blockSize.x == Reduction_maxThreadsPerBlock ) {
		cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		// shared memory is allocated statically inside the kernel
		CudaReductionKernel< Reduction_maxThreadsPerBlock >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output);
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output);
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		}
		@@ -519,12 +540,6 @@ struct CudaReductionKernelLauncher
		blockSize.x = Reduction_maxThreadsPerBlock;
		gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const Index shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
		: blockSize.x * ( sizeof( Result ) + sizeof( Index ) );

		// This is "general", but this method always sets blockSize.x to a specific value,
		// so runtime switch is not necessary - it only prolongs the compilation time.
		/*
		@@ -533,55 +548,55 @@ struct CudaReductionKernelLauncher
		{
		case 512:
		CudaReductionWithArgumentKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaReductionWithArgumentKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
		break;
		case 1:
		TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
		@@ -596,8 +611,9 @@ struct CudaReductionKernelLauncher
		if( blockSize.x == Reduction_maxThreadsPerBlock ) {
		cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		// shared memory is allocated statically inside the kernel
		CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
		<<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		}

src/TNL/Containers/Expressions/DistributedVerticalOperations.h

+12 −0

Original line number	Diff line number	Diff line
		@@ -25,6 +25,8 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d
		using ResultType = std::decay_t< decltype( expression[0] ) >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< ResultType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's result type" );
		ResultType result = std::numeric_limits< ResultType >::max();
		if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
		const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
		@@ -42,6 +44,8 @@ auto DistributedExpressionArgMin( const Expression& expression )
		using ResultType = std::pair< RealType, IndexType >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< RealType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's real type" );
		ResultType result( -1, std::numeric_limits< RealType >::max() );
		const auto group = expression.getCommunicationGroup();
		if( group != CommunicatorType::NullGroup ) {
		@@ -82,6 +86,8 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d
		using ResultType = std::decay_t< decltype( expression[0] ) >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< ResultType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's result type" );
		ResultType result = std::numeric_limits< ResultType >::lowest();
		if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
		const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
		@@ -99,6 +105,8 @@ auto DistributedExpressionArgMax( const Expression& expression )
		using ResultType = std::pair< RealType, IndexType >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< RealType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's real type" );
		ResultType result( -1, std::numeric_limits< RealType >::lowest() );
		const auto group = expression.getCommunicationGroup();
		if( group != CommunicatorType::NullGroup ) {
		@@ -168,6 +176,8 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec
		using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< ResultType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's result type" );
		ResultType result = std::numeric_limits< ResultType >::max();
		if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
		const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
		@@ -196,6 +206,8 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca
		using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
		using CommunicatorType = typename Expression::CommunicatorType;

		static_assert( std::numeric_limits< ResultType >::is_specialized,
		"std::numeric_limits is not specialized for the reduction's result type" );
		ResultType result = std::numeric_limits< ResultType >::max();
		if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
		const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );

src/TNL/Containers/Expressions/ExpressionVariableType.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,7 @@ template< typename T, typename V = T >
		constexpr ExpressionVariableType
		getExpressionVariableType()
		{
		if( std::is_arithmetic< std::decay_t< T > >::value )
		if( std::is_arithmetic< T >::value )
		return ArithmeticVariable;
		// vectors must be considered as an arithmetic type when used as RealType in another vector
		if( IsArithmeticSubtype< T, V >::value )