Loading CMakeLists.txt +10 −0 Original line number Diff line number Diff line Loading @@ -127,6 +127,16 @@ if( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-self-assign-overloaded" ) endif() # enable address sanitizer (does not work with MPI due to many false positives, does not work with nvcc at all) if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) if( NOT ${WITH_MPI} AND NOT ${WITH_CUDA} ) set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" ) set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "${CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) set( CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) set( CMAKE_SHARED_LINKER_FLAGS_DEBUG "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) endif() endif() # enable link time optimizations (but not in continuous integration) if( NOT DEFINED ENV{CI_JOB_NAME} ) if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" ) Loading src/Python/pytnl/tnl/Array.h +5 −18 Original line number Diff line number Diff line Loading @@ -4,25 +4,14 @@ #include <pybind11/operators.h> namespace py = pybind11; // including pybind11/numpy.h is needed for the specializations of py::format_descriptor // for enum types, see https://github.com/pybind/pybind11/issues/2135 #include <pybind11/numpy.h> #include "../tnl_indexing.h" #include <TNL/Containers/Array.h> // pybind11 should actually take care of this inside py::format_descriptor, but apparently it does not work... // see https://github.com/pybind/pybind11/issues/2135 template< typename T, typename = void > struct underlying_type { using type = T; }; template< typename T > struct underlying_type< T, std::enable_if_t< std::is_enum< T >::value > > { using type = std::underlying_type_t< T >; }; template< typename ArrayType > void export_Array(py::module & m, const char* name) { Loading Loading @@ -64,9 +53,7 @@ void export_Array(py::module & m, const char* name) // Size of one scalar sizeof( typename ArrayType::ValueType ), // Python struct-style format descriptor py::format_descriptor< typename underlying_type< typename ArrayType::ValueType >::type >::format(), py::format_descriptor< typename ArrayType::ValueType >::format(), // Number of dimensions 1, // Buffer dimensions Loading src/TNL/Algorithms/CudaReductionKernel.h +71 −55 Original line number Diff line number Diff line Loading @@ -15,7 +15,6 @@ #include <TNL/Assert.h> #include <TNL/Math.h> #include <TNL/Cuda/DeviceInfo.h> #include <TNL/Cuda/SharedMemory.h> #include <TNL/Algorithms/CudaReductionBuffer.h> #include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #include <TNL/Exceptions/CudaSupportMissing.h> Loading @@ -40,6 +39,25 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 8; #endif /* * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the * std::plus<void>::operator() function to be constexpr and hence __host__ __device__ * (for example, when the arguments are StaticVector<3, double> etc). Hence, we use * this wrapper which triggers only a warning and not an error as is the case when * the reduction functor is called from a __global__ or __device__ function. Let's * hope it works otherwise... */ template< typename Reduction, typename Arg1, typename Arg2 > __host__ __device__ auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 ) { // let's suppress the aforementioned warning... #pragma push #pragma diag_suppress 2979 return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) ); #pragma pop } template< int blockSize, typename Result, typename DataFetcher, Loading @@ -54,7 +72,11 @@ CudaReductionKernel( const Result zero, const Index end, Result* output ) { Result* sdata = Cuda::getSharedMemory< Result >(); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize; __shared__ Result sdata[shmemElements]; // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.x; Loading @@ -65,19 +87,19 @@ CudaReductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSize < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); Loading @@ -85,48 +107,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( tid < 32 ) { if( blockSize >= 64 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. Loading @@ -150,8 +172,12 @@ CudaReductionWithArgumentKernel( const Result zero, Index* idxOutput, const Index* idxInput = nullptr ) { Result* sdata = Cuda::getSharedMemory< Result >(); Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize; __shared__ Result sdata[shmemElements]; __shared__ Index sidx[shmemElements]; // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.x; Loading Loading @@ -409,12 +435,6 @@ struct CudaReductionKernelLauncher blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) ? 2 * blockSize.x * sizeof( Result ) : blockSize.x * sizeof( Result ); // This is "general", but this method always sets blockSize.x to a specific value, // so runtime switch is not necessary - it only prolongs the compilation time. /* Loading @@ -423,55 +443,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading @@ -486,8 +506,9 @@ struct CudaReductionKernelLauncher if( blockSize.x == Reduction_maxThreadsPerBlock ) { cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); // shared memory is allocated statically inside the kernel CudaReductionKernel< Reduction_maxThreadsPerBlock > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; } Loading Loading @@ -519,12 +540,6 @@ struct CudaReductionKernelLauncher blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); // This is "general", but this method always sets blockSize.x to a specific value, // so runtime switch is not necessary - it only prolongs the compilation time. /* Loading @@ -533,55 +548,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionWithArgumentKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 256: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 128: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 64: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 32: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 16: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 8: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 4: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 2: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading @@ -596,8 +611,9 @@ struct CudaReductionKernelLauncher if( blockSize.x == Reduction_maxThreadsPerBlock ) { cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); // shared memory is allocated statically inside the kernel CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; } Loading src/TNL/Containers/Expressions/DistributedVerticalOperations.h +12 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,8 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d using ResultType = std::decay_t< decltype( expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionMin( expression.getConstLocalView() ); Loading @@ -42,6 +44,8 @@ auto DistributedExpressionArgMin( const Expression& expression ) using ResultType = std::pair< RealType, IndexType >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::max() ); const auto group = expression.getCommunicationGroup(); if( group != CommunicatorType::NullGroup ) { Loading Loading @@ -82,6 +86,8 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d using ResultType = std::decay_t< decltype( expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::lowest(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionMax( expression.getConstLocalView() ); Loading @@ -99,6 +105,8 @@ auto DistributedExpressionArgMax( const Expression& expression ) using ResultType = std::pair< RealType, IndexType >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::lowest() ); const auto group = expression.getCommunicationGroup(); if( group != CommunicatorType::NullGroup ) { Loading Loading @@ -168,6 +176,8 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() ); Loading Loading @@ -196,6 +206,8 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() ); Loading src/TNL/Containers/Expressions/ExpressionVariableType.h +1 −1 Original line number Diff line number Diff line Loading @@ -22,7 +22,7 @@ template< typename T, typename V = T > constexpr ExpressionVariableType getExpressionVariableType() { if( std::is_arithmetic< std::decay_t< T > >::value ) if( std::is_arithmetic< T >::value ) return ArithmeticVariable; // vectors must be considered as an arithmetic type when used as RealType in another vector if( IsArithmeticSubtype< T, V >::value ) Loading Loading
CMakeLists.txt +10 −0 Original line number Diff line number Diff line Loading @@ -127,6 +127,16 @@ if( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-self-assign-overloaded" ) endif() # enable address sanitizer (does not work with MPI due to many false positives, does not work with nvcc at all) if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) if( NOT ${WITH_MPI} AND NOT ${WITH_CUDA} ) set( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" ) set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "${CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) set( CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) set( CMAKE_SHARED_LINKER_FLAGS_DEBUG "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined" ) endif() endif() # enable link time optimizations (but not in continuous integration) if( NOT DEFINED ENV{CI_JOB_NAME} ) if( CMAKE_CXX_COMPILER_ID STREQUAL "GNU" ) Loading
src/Python/pytnl/tnl/Array.h +5 −18 Original line number Diff line number Diff line Loading @@ -4,25 +4,14 @@ #include <pybind11/operators.h> namespace py = pybind11; // including pybind11/numpy.h is needed for the specializations of py::format_descriptor // for enum types, see https://github.com/pybind/pybind11/issues/2135 #include <pybind11/numpy.h> #include "../tnl_indexing.h" #include <TNL/Containers/Array.h> // pybind11 should actually take care of this inside py::format_descriptor, but apparently it does not work... // see https://github.com/pybind/pybind11/issues/2135 template< typename T, typename = void > struct underlying_type { using type = T; }; template< typename T > struct underlying_type< T, std::enable_if_t< std::is_enum< T >::value > > { using type = std::underlying_type_t< T >; }; template< typename ArrayType > void export_Array(py::module & m, const char* name) { Loading Loading @@ -64,9 +53,7 @@ void export_Array(py::module & m, const char* name) // Size of one scalar sizeof( typename ArrayType::ValueType ), // Python struct-style format descriptor py::format_descriptor< typename underlying_type< typename ArrayType::ValueType >::type >::format(), py::format_descriptor< typename ArrayType::ValueType >::format(), // Number of dimensions 1, // Buffer dimensions Loading
src/TNL/Algorithms/CudaReductionKernel.h +71 −55 Original line number Diff line number Diff line Loading @@ -15,7 +15,6 @@ #include <TNL/Assert.h> #include <TNL/Math.h> #include <TNL/Cuda/DeviceInfo.h> #include <TNL/Cuda/SharedMemory.h> #include <TNL/Algorithms/CudaReductionBuffer.h> #include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #include <TNL/Exceptions/CudaSupportMissing.h> Loading @@ -40,6 +39,25 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 8; #endif /* * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the * std::plus<void>::operator() function to be constexpr and hence __host__ __device__ * (for example, when the arguments are StaticVector<3, double> etc). Hence, we use * this wrapper which triggers only a warning and not an error as is the case when * the reduction functor is called from a __global__ or __device__ function. Let's * hope it works otherwise... */ template< typename Reduction, typename Arg1, typename Arg2 > __host__ __device__ auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 ) { // let's suppress the aforementioned warning... #pragma push #pragma diag_suppress 2979 return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) ); #pragma pop } template< int blockSize, typename Result, typename DataFetcher, Loading @@ -54,7 +72,11 @@ CudaReductionKernel( const Result zero, const Index end, Result* output ) { Result* sdata = Cuda::getSharedMemory< Result >(); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize; __shared__ Result sdata[shmemElements]; // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.x; Loading @@ -65,19 +87,19 @@ CudaReductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSize < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < end ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); Loading @@ -85,48 +107,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( tid < 32 ) { if( blockSize >= 64 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. Loading @@ -150,8 +172,12 @@ CudaReductionWithArgumentKernel( const Result zero, Index* idxOutput, const Index* idxInput = nullptr ) { Result* sdata = Cuda::getSharedMemory< Result >(); Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize; __shared__ Result sdata[shmemElements]; __shared__ Index sidx[shmemElements]; // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.x; Loading Loading @@ -409,12 +435,6 @@ struct CudaReductionKernelLauncher blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) ? 2 * blockSize.x * sizeof( Result ) : blockSize.x * sizeof( Result ); // This is "general", but this method always sets blockSize.x to a specific value, // so runtime switch is not necessary - it only prolongs the compilation time. /* Loading @@ -423,55 +443,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading @@ -486,8 +506,9 @@ struct CudaReductionKernelLauncher if( blockSize.x == Reduction_maxThreadsPerBlock ) { cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); // shared memory is allocated statically inside the kernel CudaReductionKernel< Reduction_maxThreadsPerBlock > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; } Loading Loading @@ -519,12 +540,6 @@ struct CudaReductionKernelLauncher blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); // This is "general", but this method always sets blockSize.x to a specific value, // so runtime switch is not necessary - it only prolongs the compilation time. /* Loading @@ -533,55 +548,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionWithArgumentKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 256: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 128: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 64: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 32: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 16: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 8: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 4: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 2: cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading @@ -596,8 +611,9 @@ struct CudaReductionKernelLauncher if( blockSize.x == Reduction_maxThreadsPerBlock ) { cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); // shared memory is allocated statically inside the kernel CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput ); <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; } Loading
src/TNL/Containers/Expressions/DistributedVerticalOperations.h +12 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,8 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d using ResultType = std::decay_t< decltype( expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionMin( expression.getConstLocalView() ); Loading @@ -42,6 +44,8 @@ auto DistributedExpressionArgMin( const Expression& expression ) using ResultType = std::pair< RealType, IndexType >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::max() ); const auto group = expression.getCommunicationGroup(); if( group != CommunicatorType::NullGroup ) { Loading Loading @@ -82,6 +86,8 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d using ResultType = std::decay_t< decltype( expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::lowest(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionMax( expression.getConstLocalView() ); Loading @@ -99,6 +105,8 @@ auto DistributedExpressionArgMax( const Expression& expression ) using ResultType = std::pair< RealType, IndexType >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::lowest() ); const auto group = expression.getCommunicationGroup(); if( group != CommunicatorType::NullGroup ) { Loading Loading @@ -168,6 +176,8 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() ); Loading Loading @@ -196,6 +206,8 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >; using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() ); Loading
src/TNL/Containers/Expressions/ExpressionVariableType.h +1 −1 Original line number Diff line number Diff line Loading @@ -22,7 +22,7 @@ template< typename T, typename V = T > constexpr ExpressionVariableType getExpressionVariableType() { if( std::is_arithmetic< std::decay_t< T > >::value ) if( std::is_arithmetic< T >::value ) return ArithmeticVariable; // vectors must be considered as an arithmetic type when used as RealType in another vector if( IsArithmeticSubtype< T, V >::value ) Loading