Loading CMakeLists.txt +5 −1 Original line number Original line Diff line number Diff line Loading @@ -202,7 +202,7 @@ endif() # Check for CUDA # Check for CUDA # # if( ${WITH_CUDA} ) if( ${WITH_CUDA} ) find_package( CUDA 9.0 ) find_package( CUDA 10.0 ) if( CUDA_FOUND ) if( CUDA_FOUND ) set( BUILD_CUDA TRUE) set( BUILD_CUDA TRUE) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_SEPARABLE_COMPILATION ON) Loading @@ -229,6 +229,10 @@ if( ${WITH_CUDA} ) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) # nvcc 10 causes many invalid VLA errors in the host code if( ${CUDA_VERSION_MAJOR} STREQUAL "10" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -Wno-vla ) endif() # Select GPU architecture # Select GPU architecture ## cmake bug: cuda_select_nvcc_arch_flags does not work with CMAKE_EXECUTABLE_SUFFIX ## cmake bug: cuda_select_nvcc_arch_flags does not work with CMAKE_EXECUTABLE_SUFFIX ## see https://gitlab.kitware.com/cmake/cmake/issues/19636 ## see https://gitlab.kitware.com/cmake/cmake/issues/19636 Loading src/TNL/Algorithms/detail/Reduction.hpp +3 −42 Original line number Original line Diff line number Diff line Loading @@ -302,20 +302,7 @@ Reduction< Devices::Cuda >::reduce( const Index begin, const Index end, Fetch&& if( can_reduce_later_on_host ) { if( can_reduce_later_on_host ) { // transfer the reduced data from device to host // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; // Workaround for nvcc 10.1.168 - it would modify the simple expression // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Result[ static_cast< const int& >( reducedSize ) ] #else new Result[ reducedSize ] #endif }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -392,34 +379,8 @@ Reduction< Devices::Cuda >::reduceWithArgument( const Index begin, if( can_reduce_later_on_host ) { if( can_reduce_later_on_host ) { // transfer the reduced data from device to host // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; // Workaround for nvcc 10.1.168 - it would modify the simple expression std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Result[ static_cast< const int& >( reducedSize ) ] #else new Result[ reducedSize ] #endif }; std::unique_ptr< Index[] > indexArray{ // Workaround for nvcc 10.1.168 - it would modify the simple expression // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Index[ static_cast< const int& >( reducedSize ) ] #else new Index[ reducedSize ] #endif }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); Loading Loading
CMakeLists.txt +5 −1 Original line number Original line Diff line number Diff line Loading @@ -202,7 +202,7 @@ endif() # Check for CUDA # Check for CUDA # # if( ${WITH_CUDA} ) if( ${WITH_CUDA} ) find_package( CUDA 9.0 ) find_package( CUDA 10.0 ) if( CUDA_FOUND ) if( CUDA_FOUND ) set( BUILD_CUDA TRUE) set( BUILD_CUDA TRUE) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_SEPARABLE_COMPILATION ON) Loading @@ -229,6 +229,10 @@ if( ${WITH_CUDA} ) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=code_is_unreachable -Xcudafe --diag_suppress=loop_not_reachable -Xcudafe --diag_suppress=implicit_return_from_non_void_function -Xcudafe --diag_suppress=unsigned_compare_with_zero -Xcudafe --display_error_number) # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 # This diagnostic is just plain wrong in CUDA 9 and later, see https://github.com/kokkos/kokkos/issues/1470 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored) # nvcc 10 causes many invalid VLA errors in the host code if( ${CUDA_VERSION_MAJOR} STREQUAL "10" ) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -Wno-vla ) endif() # Select GPU architecture # Select GPU architecture ## cmake bug: cuda_select_nvcc_arch_flags does not work with CMAKE_EXECUTABLE_SUFFIX ## cmake bug: cuda_select_nvcc_arch_flags does not work with CMAKE_EXECUTABLE_SUFFIX ## see https://gitlab.kitware.com/cmake/cmake/issues/19636 ## see https://gitlab.kitware.com/cmake/cmake/issues/19636 Loading
src/TNL/Algorithms/detail/Reduction.hpp +3 −42 Original line number Original line Diff line number Diff line Loading @@ -302,20 +302,7 @@ Reduction< Devices::Cuda >::reduce( const Index begin, const Index end, Fetch&& if( can_reduce_later_on_host ) { if( can_reduce_later_on_host ) { // transfer the reduced data from device to host // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; // Workaround for nvcc 10.1.168 - it would modify the simple expression // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Result[ static_cast< const int& >( reducedSize ) ] #else new Result[ reducedSize ] #endif }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -392,34 +379,8 @@ Reduction< Devices::Cuda >::reduceWithArgument( const Index begin, if( can_reduce_later_on_host ) { if( can_reduce_later_on_host ) { // transfer the reduced data from device to host // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; // Workaround for nvcc 10.1.168 - it would modify the simple expression std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Result[ static_cast< const int& >( reducedSize ) ] #else new Result[ reducedSize ] #endif }; std::unique_ptr< Index[] > indexArray{ // Workaround for nvcc 10.1.168 - it would modify the simple expression // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` // which is not correct - see e.g. https://stackoverflow.com/a/39671946 // Thus, the host compiler would spit out hundreds of warnings... // Funnily enough, nvcc's behaviour depends on the context rather than the // expression, because exactly the same simple expression in different places // does not produce warnings. #ifdef __NVCC__ new Index[ static_cast< const int& >( reducedSize ) ] #else new Index[ reducedSize ] #endif }; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); Loading