Loading src/TNL/Devices/Cuda.h +10 −2 Original line number Diff line number Diff line Loading @@ -188,6 +188,14 @@ class Cuda std::ostream& operator << ( std::ostream& str, const dim3& d ); #endif #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val); } #endif #endif } // namespace Devices } // namespace TNL Loading src/TNL/Devices/Cuda_impl.h +24 −14 Original line number Diff line number Diff line Loading @@ -160,20 +160,30 @@ __device__ Element* Cuda::getSharedMemory() return CudaSharedMemory< Element >(); } // TODO: This is only for Kepler and older architectures. Fix it. __device__ inline double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = ( unsigned long long int* ) address; // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double( assumed ) ) ); } while( assumed != old ); old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif /* HAVE_CUDA */ } // namespace Devices Loading Loading
src/TNL/Devices/Cuda.h +10 −2 Original line number Diff line number Diff line Loading @@ -188,6 +188,14 @@ class Cuda std::ostream& operator << ( std::ostream& str, const dim3& d ); #endif #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val); } #endif #endif } // namespace Devices } // namespace TNL Loading
src/TNL/Devices/Cuda_impl.h +24 −14 Original line number Diff line number Diff line Loading @@ -160,20 +160,30 @@ __device__ Element* Cuda::getSharedMemory() return CudaSharedMemory< Element >(); } // TODO: This is only for Kepler and older architectures. Fix it. __device__ inline double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = ( unsigned long long int* ) address; // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double( assumed ) ) ); } while( assumed != old ); old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif /* HAVE_CUDA */ } // namespace Devices Loading