Loading src/TNL/Algorithms/AtomicOperations.h +6 −42 Original line number Original line Diff line number Diff line Loading @@ -8,19 +8,13 @@ #pragma once #pragma once #ifdef HAVE_CUDA #include <TNL/Atomic.h> #include <cuda.h> #endif #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { namespace TNL { namespace Algorithms { namespace Algorithms { template< typename Device > template< typename Device > struct AtomicOperations struct AtomicOperations; {}; template<> template<> struct AtomicOperations< Devices::Host > struct AtomicOperations< Devices::Host > Loading Loading @@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda > { { #ifdef HAVE_CUDA #ifdef HAVE_CUDA return atomicAdd( &v, a ); return atomicAdd( &v, a ); #endif #else } #ifdef HAVE_CUDA __device__ static double add( double& v, const double& a ) { #if __CUDA_ARCH__ < 600 unsigned long long int* v_as_ull = (unsigned long long int*) &v; unsigned long long int old = *v_as_ull, assumed; do { assumed = old; old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) ); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while( assumed != old ); return old; #else // __CUDA_ARCH__ < 600 return atomicAdd( &v, a ); #endif //__CUDA_ARCH__ < 600 } #endif // HAVE_CUDA __cuda_callable__ static long int add( long int& v, const long int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." ); #endif // HAVE_CUDA return 0; return 0; #endif } } __cuda_callable__ __cuda_callable__ Loading @@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda > { { #ifdef HAVE_CUDA #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); #endif // HAVE_CUDA #endif return 0; return 0; } } }; }; } // namespace Algorithms } // namespace Algorithms } // namespace TNL } // namespace TNL src/TNL/Atomic.h +25 −4 Original line number Original line Diff line number Diff line Loading @@ -14,11 +14,12 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600 __device__ __device__ double double atomicAdd( double* address, double val ) atomicAdd( double* address, double val ) Loading @@ -35,8 +36,28 @@ atomicAdd( double* address, double val ) return __longlong_as_double( old ); return __longlong_as_double( old ); } } } // namespace #endif #endif __device__ long int atomicAdd( long int* address, long int val ) { unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address ); long int old = *address; long int assumed; do { assumed = old; long int sum = val + assumed; old = atomicCAS( address_as_unsigned, *reinterpret_cast< unsigned long long int* >( &assumed ), *reinterpret_cast< unsigned long long int* >( &sum ) ); } while( assumed != old ); return old; } } // namespace #endif #endif namespace TNL { namespace TNL { Loading Loading
src/TNL/Algorithms/AtomicOperations.h +6 −42 Original line number Original line Diff line number Diff line Loading @@ -8,19 +8,13 @@ #pragma once #pragma once #ifdef HAVE_CUDA #include <TNL/Atomic.h> #include <cuda.h> #endif #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { namespace TNL { namespace Algorithms { namespace Algorithms { template< typename Device > template< typename Device > struct AtomicOperations struct AtomicOperations; {}; template<> template<> struct AtomicOperations< Devices::Host > struct AtomicOperations< Devices::Host > Loading Loading @@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda > { { #ifdef HAVE_CUDA #ifdef HAVE_CUDA return atomicAdd( &v, a ); return atomicAdd( &v, a ); #endif #else } #ifdef HAVE_CUDA __device__ static double add( double& v, const double& a ) { #if __CUDA_ARCH__ < 600 unsigned long long int* v_as_ull = (unsigned long long int*) &v; unsigned long long int old = *v_as_ull, assumed; do { assumed = old; old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) ); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while( assumed != old ); return old; #else // __CUDA_ARCH__ < 600 return atomicAdd( &v, a ); #endif //__CUDA_ARCH__ < 600 } #endif // HAVE_CUDA __cuda_callable__ static long int add( long int& v, const long int& a ) { #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." ); #endif // HAVE_CUDA return 0; return 0; #endif } } __cuda_callable__ __cuda_callable__ Loading @@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda > { { #ifdef HAVE_CUDA #ifdef HAVE_CUDA TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." ); #endif // HAVE_CUDA #endif return 0; return 0; } } }; }; } // namespace Algorithms } // namespace Algorithms } // namespace TNL } // namespace TNL
src/TNL/Atomic.h +25 −4 Original line number Original line Diff line number Diff line Loading @@ -14,11 +14,12 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Cuda.h> // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600 __device__ __device__ double double atomicAdd( double* address, double val ) atomicAdd( double* address, double val ) Loading @@ -35,8 +36,28 @@ atomicAdd( double* address, double val ) return __longlong_as_double( old ); return __longlong_as_double( old ); } } } // namespace #endif #endif __device__ long int atomicAdd( long int* address, long int val ) { unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address ); long int old = *address; long int assumed; do { assumed = old; long int sum = val + assumed; old = atomicCAS( address_as_unsigned, *reinterpret_cast< unsigned long long int* >( &assumed ), *reinterpret_cast< unsigned long long int* >( &sum ) ); } while( assumed != old ); return old; } } // namespace #endif #endif namespace TNL { namespace TNL { Loading