Commit 25b966e4 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Fixed double-precision atomicAdd function for old GPUs

parent 576216b8
Loading
Loading
Loading
Loading
+10 −2
Original line number Diff line number Diff line
@@ -188,6 +188,14 @@ class Cuda
std::ostream& operator << ( std::ostream& str, const dim3& d );
#endif

#ifdef HAVE_CUDA
// Forward declaration of the double-precision atomicAdd fallback for GPUs
// older than Pascal (compute capability < 6.0), where the hardware has no
// native double atomicAdd. The definition lives in the implementation file.
// NOTE(review): when this header is compiled in the host pass,
// __CUDA_ARCH__ is undefined and the preprocessor treats it as 0, so the
// declaration is visible to host compilation as well — presumably harmless
// since it is __device__-only; confirm against the nvcc compilation model.
#if __CUDA_ARCH__ < 600
namespace {
   __device__ double atomicAdd(double* address, double val);
}
#endif
#endif

} // namespace Devices
} // namespace TNL

+24 −14
Original line number Diff line number Diff line
@@ -160,20 +160,30 @@ __device__ Element* Cuda::getSharedMemory()
   return CudaSharedMemory< Element >();
}

// double-precision atomicAdd fallback for Maxwell and older GPUs
// (compute capability < 6.0, which lack a native double atomicAdd).
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#if __CUDA_ARCH__ < 600
namespace {
   // Atomically performs *address += val and returns the OLD value stored
   // at *address, matching the semantics of the built-in atomicAdd overloads.
   __device__ double atomicAdd(double* address, double val)
   {
       // Reinterpret the double's 64-bit storage as an integer so the update
       // can be implemented with atomicCAS (the only 64-bit atomic available
       // on these architectures).
       unsigned long long int* address_as_ull =
                                 (unsigned long long int*)address;
       unsigned long long int old = *address_as_ull, assumed;

       // Classic CAS loop: retry until no other thread modified *address
       // between our read and our compare-and-swap.
       do {
           assumed = old;
           old = atomicCAS(address_as_ull, assumed,
                           __double_as_longlong(val +
                                  __longlong_as_double(assumed)));

       // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
       } while (assumed != old);

       return __longlong_as_double(old);
   }
} // namespace
#endif

#endif /* HAVE_CUDA */

} // namespace Devices