Commit 37aab443 authored by Jakub Klinkovský

Refactored Algorithms/Atomic.h and added atomicAdd for long int via atomicCAS on CUDA

parent bd59c376
1 merge request: !142 Sparse matrix transposition
TNL/Algorithms/Atomic.h
@@ -8,19 +8,13 @@
#pragma once
#ifdef HAVE_CUDA
#include <cuda.h>
#endif
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Atomic.h>
namespace TNL {
namespace Algorithms {
template< typename Device >
struct AtomicOperations
{};
struct AtomicOperations;
template<>
struct AtomicOperations< Devices::Host >
@@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda >
{
#ifdef HAVE_CUDA
return atomicAdd( &v, a );
#endif
}
#ifdef HAVE_CUDA
__device__
static double
add( double& v, const double& a )
{
#if __CUDA_ARCH__ < 600
unsigned long long int* v_as_ull = (unsigned long long int*) &v;
unsigned long long int old = *v_as_ull, assumed;
do {
assumed = old;
old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while( assumed != old );
return old;
#else // __CUDA_ARCH__ < 600
return atomicAdd( &v, a );
#endif //__CUDA_ARCH__ < 600
}
#endif // HAVE_CUDA
__cuda_callable__
static long int
add( long int& v, const long int& a )
{
#ifdef HAVE_CUDA
TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
#endif // HAVE_CUDA
return atomicAdd( &v, a );
#else
return 0;
#endif
}
__cuda_callable__
@@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda >
{
#ifdef HAVE_CUDA
TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
#endif // HAVE_CUDA
#endif
return 0;
}
};
} // namespace Algorithms
} // namespace TNL
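
For context, a minimal usage sketch of the refactored interface, not part of this commit: a CUDA kernel in which every thread folds one element into a shared accumulator. The kernel name, the array names, and the include path TNL/Algorithms/Atomic.h are assumptions here.

#include <TNL/Algorithms/Atomic.h>

// hypothetical kernel: each thread adds its element to a shared accumulator
// through the AtomicOperations specialization for the CUDA device
__global__
void accumulate( long int* sum, const long int* data, int n )
{
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < n )
      TNL::Algorithms::AtomicOperations< TNL::Devices::Cuda >::add( *sum, data[ i ] );
}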
TNL/Atomic.h
@@ -14,11 +14,12 @@
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Cuda.h>
// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#ifdef HAVE_CUDA
#if __CUDA_ARCH__ < 600
namespace {
// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
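// ( the defined() check keeps this fallback out of the host compilation pass,
//   where __CUDA_ARCH__ is undefined and would otherwise compare as 0 < 600 )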
__device__
double
atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )
return __longlong_as_double( old );
}
} // namespace
#endif
__device__
long int
atomicAdd( long int* address, long int val )
{
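// CUDA's atomicCAS has no signed 64-bit overload, so the operands are
// type-punned through unsigned long long int; note this assumes
// sizeof( long int ) == sizeof( unsigned long long int ), as on LP64 platforms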
unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
long int old = *address;
long int assumed;
do {
assumed = old;
long int sum = val + assumed;
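// commit the sum only if *address still holds the value read into assumed;
// if another thread changed it in the meantime, retry with the fresh value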
old = atomicCAS( address_as_unsigned,
*reinterpret_cast< unsigned long long int* >( &assumed ),
*reinterpret_cast< unsigned long long int* >( &sum ) );
} while( assumed != old );
return old;
}
} // namespace
#endif
namespace TNL {
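
The compare-and-swap loop above is the general recipe for synthesizing an atomic operation that the hardware does not provide. As an illustration of the same technique beyond this commit, here is a sketch of an atomic multiply for double, an operation CUDA lacks entirely; the helper name atomicMul is hypothetical.

__device__
double
atomicMul( double* address, double val )
{
   unsigned long long int* address_as_ull = reinterpret_cast< unsigned long long int* >( address );
   unsigned long long int old = *address_as_ull, assumed;
   do {
      assumed = old;
      // reinterpret the bits, multiply, and try to swap the result in
      old = atomicCAS( address_as_ull, assumed,
                       __double_as_longlong( val * __longlong_as_double( assumed ) ) );
      // integer comparison avoids an endless loop when *address is NaN (NaN != NaN)
   } while( assumed != old );
   return __longlong_as_double( old );
}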