Commit 37aab443 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactored Algorithms/Atomic.h and added atomicAdd for long int via atomicCAS on CUDA

parent bd59c376
Loading
Loading
Loading
Loading
+6 −42
Original line number Original line Diff line number Diff line
@@ -8,19 +8,13 @@


#pragma once
#pragma once


#ifdef HAVE_CUDA
#include <TNL/Atomic.h>
   #include <cuda.h>
#endif
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>


namespace TNL {
namespace TNL {
namespace Algorithms {
namespace Algorithms {


template< typename Device >
template< typename Device >
struct AtomicOperations
struct AtomicOperations;
{};


template<>
template<>
struct AtomicOperations< Devices::Host >
struct AtomicOperations< Devices::Host >
@@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda >
   {
   {
#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
      return atomicAdd( &v, a );
      return atomicAdd( &v, a );
#endif
#else
   }

#ifdef HAVE_CUDA
   __device__
   static double
   add( double& v, const double& a )
   {
   #if __CUDA_ARCH__ < 600
      unsigned long long int* v_as_ull = (unsigned long long int*) &v;
      unsigned long long int old = *v_as_ull, assumed;

      do {
         assumed = old;
         old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );

         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
      } while( assumed != old );

      return old;
   #else   // __CUDA_ARCH__ < 600
      return atomicAdd( &v, a );
   #endif  //__CUDA_ARCH__ < 600
   }
#endif  // HAVE_CUDA

   __cuda_callable__
   static long int
   add( long int& v, const long int& a )
   {
#ifdef HAVE_CUDA
      TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
#endif  // HAVE_CUDA
      return 0;
      return 0;
#endif
   }
   }


   __cuda_callable__
   __cuda_callable__
@@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda >
   {
   {
#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
      TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
      TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
#endif  // HAVE_CUDA
#endif
      return 0;
      return 0;
   }
   }
};
};

}  // namespace Algorithms
}  // namespace Algorithms
}  // namespace TNL
}  // namespace TNL
+25 −4
Original line number Original line Diff line number Diff line
@@ -14,11 +14,12 @@
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Cuda.h>


// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
   #if __CUDA_ARCH__ < 600
namespace {
namespace {

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
   #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
__device__
__device__
double
double
atomicAdd( double* address, double val )
atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )


   return __longlong_as_double( old );
   return __longlong_as_double( old );
}
}
}  // namespace
   #endif
   #endif

__device__
long int
atomicAdd( long int* address, long int val )
{
   unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
   long int old = *address;
   long int assumed;

   do {
      assumed = old;
      long int sum = val + assumed;
      old = atomicCAS( address_as_unsigned,
                       *reinterpret_cast< unsigned long long int* >( &assumed ),
                       *reinterpret_cast< unsigned long long int* >( &sum ) );
   } while( assumed != old );

   return old;
}

}  // namespace
#endif
#endif


namespace TNL {
namespace TNL {