diff --git a/CMakeLists.txt b/CMakeLists.txt index b7182b3055cfa8c25f2fb601f9c776809fcff42c..528d89fab11aaba5500815e161cafca3ef72c984 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,7 +202,7 @@ endif() # Check for CUDA # if( ${WITH_CUDA} ) - find_package( CUDA 9.0 ) + find_package( CUDA 10.0 ) if( CUDA_FOUND ) set( BUILD_CUDA TRUE) set(CUDA_SEPARABLE_COMPILATION ON) @@ -229,6 +229,10 @@ if( ${WITH_CUDA} ) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) + # nvcc 10 (seen with 10.1.168) can rewrite `new T[n]` as `new (T[n])`, so the host compiler reports spurious VLA diagnostics + if( ${CUDA_VERSION_MAJOR} STREQUAL "10" ) + set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -Wno-vla ) + endif() # Select GPU architecture ## cmake bug: cuda_select_nvcc_arch_flags does not work with CMAKE_EXECUTABLE_SUFFIX ## see https://gitlab.kitware.com/cmake/cmake/issues/19636 diff --git a/src/TNL/Algorithms/detail/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp index dc7d59f4a13aa394aafd610d5acdb210f51426ea..abbfaa4b1548dad9d63df15525f35d126b4a6ede 100644 --- a/src/TNL/Algorithms/detail/Reduction.hpp +++ b/src/TNL/Algorithms/detail/Reduction.hpp @@ -302,20 +302,7 @@ Reduction< Devices::Cuda >::reduce( const Index begin, const Index end, Fetch&& if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ -// Workaround for nvcc 10.1.168 - it would modify the simple expression -// `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` -// which is not correct - see e.g. 
https://stackoverflow.com/a/39671946 -// Thus, the host compiler would spit out hundreds of warnings... -// Funnily enough, nvcc's behaviour depends on the context rather than the -// expression, because exactly the same simple expression in different places -// does not produce warnings. -#ifdef __NVCC__ - new Result[ static_cast< const int& >( reducedSize ) ] -#else - new Result[ reducedSize ] -#endif - }; + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -392,34 +379,8 @@ Reduction< Devices::Cuda >::reduceWithArgument( const Index begin, if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ -// Workaround for nvcc 10.1.168 - it would modify the simple expression -// `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` -// which is not correct - see e.g. https://stackoverflow.com/a/39671946 -// Thus, the host compiler would spit out hundreds of warnings... -// Funnily enough, nvcc's behaviour depends on the context rather than the -// expression, because exactly the same simple expression in different places -// does not produce warnings. -#ifdef __NVCC__ - new Result[ static_cast< const int& >( reducedSize ) ] -#else - new Result[ reducedSize ] -#endif - }; - std::unique_ptr< Index[] > indexArray{ -// Workaround for nvcc 10.1.168 - it would modify the simple expression -// `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` -// which is not correct - see e.g. https://stackoverflow.com/a/39671946 -// Thus, the host compiler would spit out hundreds of warnings... -// Funnily enough, nvcc's behaviour depends on the context rather than the -// expression, because exactly the same simple expression in different places -// does not produce warnings. 
-#ifdef __NVCC__ - new Index[ static_cast< const int& >( reducedSize ) ] -#else - new Index[ reducedSize ] -#endif - }; + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; + std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize );