Fixed double-precision atomicAdd function for old GPUs (25b966e4) · Commits · TNL / tnl-dev

src/TNL/Devices/Cuda.h

+10 −2

Original line number	Diff line number	Diff line
		@@ -188,6 +188,14 @@ class Cuda
		std::ostream& operator << ( std::ostream& str, const dim3& d );
		#endif

		#ifdef HAVE_CUDA
		#if __CUDA_ARCH__ < 600
		namespace {
		__device__ double atomicAdd(double* address, double val);
		}
		#endif
		#endif

		} // namespace Devices
		} // namespace TNL

+24 −14

Original line number	Diff line number	Diff line
		@@ -160,20 +160,30 @@ __device__ Element* Cuda::getSharedMemory()
		return CudaSharedMemory< Element >();
		}

		// TODO: This is only for Kepler and older architectures. Fix it.
		__device__
		inline double atomicAdd(double* address, double val)
		{
		unsigned long long int* address_as_ull = ( unsigned long long int* ) address;
		// double-precision atomicAdd function for Maxwell and older GPUs
		// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
		#if __CUDA_ARCH__ < 600
		namespace {
		__device__ double atomicAdd(double* address, double val)
		{
		unsigned long long int* address_as_ull =
		(unsigned long long int*)address;
		unsigned long long int old = *address_as_ull, assumed;
		do
		{

		do {
		assumed = old;
		old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double( assumed ) ) );
		}
		while( assumed != old );
		old = atomicCAS(address_as_ull, assumed,
		__double_as_longlong(val +
		__longlong_as_double(assumed)));

		// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
		} while (assumed != old);

		return __longlong_as_double(old);
		}
		} // namespace
		#endif

		#endif /* HAVE_CUDA */

		} // namespace Devices