diff --git a/src/TNL/Algorithms/AtomicOperations.h b/src/TNL/Algorithms/AtomicOperations.h
index db67f5da8f395ef9e92f6a911138b8044a41b74a..6da10e2acab1c1429f854053ce06361bda540726 100644
--- a/src/TNL/Algorithms/AtomicOperations.h
+++ b/src/TNL/Algorithms/AtomicOperations.h
@@ -8,19 +8,13 @@
 
 #pragma once
 
-#ifdef HAVE_CUDA
-   #include <cuda.h>
-#endif
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Atomic.h>
 
 namespace TNL {
 namespace Algorithms {
 
 template< typename Device >
-struct AtomicOperations
-{};
+struct AtomicOperations;
 
 template<>
 struct AtomicOperations< Devices::Host >
@@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda >
    {
 #ifdef HAVE_CUDA
       return atomicAdd( &v, a );
-#endif
-   }
-
-#ifdef HAVE_CUDA
-   __device__
-   static double
-   add( double& v, const double& a )
-   {
-   #if __CUDA_ARCH__ < 600
-      unsigned long long int* v_as_ull = (unsigned long long int*) &v;
-      unsigned long long int old = *v_as_ull, assumed;
-
-      do {
-         assumed = old;
-         old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );
-
-         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-      } while( assumed != old );
-
-      return old;
-   #else // __CUDA_ARCH__ < 600
-      return atomicAdd( &v, a );
-   #endif //__CUDA_ARCH__ < 600
-   }
-#endif // HAVE_CUDA
-
-   __cuda_callable__
-   static long int
-   add( long int& v, const long int& a )
-   {
-#ifdef HAVE_CUDA
-      TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
-#endif // HAVE_CUDA
+#else
       return 0;
+#endif
    }
 
    __cuda_callable__
@@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda >
    {
 #ifdef HAVE_CUDA
       TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
-#endif // HAVE_CUDA
+#endif
       return 0;
    }
 };
+
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Atomic.h b/src/TNL/Atomic.h
index 94ac5dfc8cf4993f35a03f81a5d3e1ccbcd3c344..2f457d74c81f30fcccf8bef5fbe4e4c4a50869ea 100644
--- a/src/TNL/Atomic.h
+++ b/src/TNL/Atomic.h
@@ -14,11 +14,12 @@
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Cuda.h>
 
-// double-precision atomicAdd function for Maxwell and older GPUs
-// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
-   #if __CUDA_ARCH__ < 600
 namespace {
+
+// double-precision atomicAdd function for Maxwell and older GPUs
+// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
+   #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
 __device__
 double
 atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )
 
    return __longlong_as_double( old );
 }
-} // namespace
    #endif
+
+__device__
+long int
+atomicAdd( long int* address, long int val )
+{
+   unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
+   long int old = *address;
+   long int assumed;
+
+   do {
+      assumed = old;
+      long int sum = val + assumed;
+      old = atomicCAS( address_as_unsigned,
+                       *reinterpret_cast< unsigned long long int* >( &assumed ),
+                       *reinterpret_cast< unsigned long long int* >( &sum ) );
+   } while( assumed != old );
+
+   return old;
+}
+
+} // namespace
 #endif
 
 namespace TNL {