Loading src/TNL/Atomic.h +27 −2 Original line number Diff line number Diff line Loading @@ -17,11 +17,36 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif namespace TNL { template< typename T, typename Device > class Atomic {}; class Atomic; template< typename T > class Atomic< T, Devices::Host > Loading src/TNL/Devices/Cuda.h +0 −8 Original line number Diff line number Diff line Loading @@ -36,14 +36,6 @@ public: static constexpr std::size_t TransferBufferSize = 5 * 2<<20; }; #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val); } #endif #endif } // namespace Devices } // namespace TNL Loading src/TNL/Devices/Cuda_impl.h +0 −26 Original line number Diff line number Diff line Loading @@ -56,31 +56,5 @@ inline constexpr int Cuda::getGPUTransferBufferSize() return 1 << 20; } // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif } // namespace Devices } // namespace TNL Loading
src/TNL/Atomic.h +27 −2 Original line number Diff line number Diff line Loading @@ -17,11 +17,36 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif namespace TNL { template< typename T, typename Device > class Atomic {}; class Atomic; template< typename T > class Atomic< T, Devices::Host > Loading
src/TNL/Devices/Cuda.h +0 −8 Original line number Diff line number Diff line Loading @@ -36,14 +36,6 @@ public: static constexpr std::size_t TransferBufferSize = 5 * 2<<20; }; #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val); } #endif #endif } // namespace Devices } // namespace TNL Loading
src/TNL/Devices/Cuda_impl.h +0 −26 Original line number Diff line number Diff line Loading @@ -56,31 +56,5 @@ inline constexpr int Cuda::getGPUTransferBufferSize() return 1 << 20; } // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } } // namespace #endif #endif } // namespace Devices } // namespace TNL