diff --git a/src/TNL/Algorithms/AtomicOperations.h b/src/TNL/Algorithms/AtomicOperations.h
index db67f5da8f395ef9e92f6a911138b8044a41b74a..6da10e2acab1c1429f854053ce06361bda540726 100644
--- a/src/TNL/Algorithms/AtomicOperations.h
+++ b/src/TNL/Algorithms/AtomicOperations.h
@@ -8,19 +8,13 @@
 
 #pragma once
 
-#ifdef HAVE_CUDA
-   #include <cuda.h>
-#endif
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Atomic.h>
 
 namespace TNL {
 namespace Algorithms {
 
 template< typename Device >
-struct AtomicOperations
-{};
+struct AtomicOperations;
 
 template<>
 struct AtomicOperations< Devices::Host >
@@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda >
    {
 #ifdef HAVE_CUDA
       return atomicAdd( &v, a );
-#endif
-   }
-
-#ifdef HAVE_CUDA
-   __device__
-   static double
-   add( double& v, const double& a )
-   {
-   #if __CUDA_ARCH__ < 600
-      unsigned long long int* v_as_ull = (unsigned long long int*) &v;
-      unsigned long long int old = *v_as_ull, assumed;
-
-      do {
-         assumed = old;
-         old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );
-
-         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-      } while( assumed != old );
-
-      return old;
-   #else // __CUDA_ARCH__ < 600
-      return atomicAdd( &v, a );
-   #endif //__CUDA_ARCH__ < 600
-   }
-#endif // HAVE_CUDA
-
-   __cuda_callable__
-   static long int
-   add( long int& v, const long int& a )
-   {
-#ifdef HAVE_CUDA
-      TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
-#endif // HAVE_CUDA
+#else
       return 0;
+#endif
    }
 
    __cuda_callable__
@@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda >
    {
 #ifdef HAVE_CUDA
       TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
-#endif // HAVE_CUDA
+#endif
       return 0;
    }
 };
+
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Atomic.h b/src/TNL/Atomic.h
index 94ac5dfc8cf4993f35a03f81a5d3e1ccbcd3c344..2f457d74c81f30fcccf8bef5fbe4e4c4a50869ea 100644
--- a/src/TNL/Atomic.h
+++ b/src/TNL/Atomic.h
@@ -14,11 +14,12 @@
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Cuda.h>
 
-// double-precision atomicAdd function for Maxwell and older GPUs
-// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
-   #if __CUDA_ARCH__ < 600
 namespace {
+
+// double-precision atomicAdd function for Maxwell and older GPUs
+// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
+   #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
 __device__
 double
 atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )
 
    return __longlong_as_double( old );
 }
-} // namespace
    #endif
+
+__device__
+long int
+atomicAdd( long int* address, long int val )
+{
+   unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
+   long int old = *address;
+   long int assumed;
+
+   do {
+      assumed = old;
+      long int sum = val + assumed;
+      old = atomicCAS( address_as_unsigned,
+                       *reinterpret_cast< unsigned long long int* >( &assumed ),
+                       *reinterpret_cast< unsigned long long int* >( &sum ) );
+   } while( assumed != old );
+
+   return old;
+}
+
+} // namespace
 #endif
 
 namespace TNL {