Refactored Algorithms/Atomic.h and added atomicAdd for long int via atomicCAS on CUDA (37aab443) · Commits · TNL / tnl-dev

src/TNL/Algorithms/AtomicOperations.h

+6 −42

Original line number	Original line	Diff line number	Diff line
	@@ -8,19 +8,13 @@

	#pragma once		#pragma once

	#ifdef HAVE_CUDA		#include <TNL/Atomic.h>
	#include <cuda.h>
	#endif
	#include <TNL/Devices/Sequential.h>
	#include <TNL/Devices/Host.h>
	#include <TNL/Devices/Cuda.h>

	namespace TNL {		namespace TNL {
	namespace Algorithms {		namespace Algorithms {

	template< typename Device >		template< typename Device >
	struct AtomicOperations		struct AtomicOperations;
	{};

	template<>		template<>
	struct AtomicOperations< Devices::Host >		struct AtomicOperations< Devices::Host >
	@@ -72,40 +66,9 @@ struct AtomicOperations< Devices::Cuda >
	{		{
	#ifdef HAVE_CUDA		#ifdef HAVE_CUDA
	return atomicAdd( &v, a );		return atomicAdd( &v, a );
	#endif		#else
	}

	#ifdef HAVE_CUDA
	__device__
	static double
	add( double& v, const double& a )
	{
	#if __CUDA_ARCH__ < 600
	unsigned long long int* v_as_ull = (unsigned long long int*) &v;
	unsigned long long int old = *v_as_ull, assumed;

	do {
	assumed = old;
	old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );

	// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
	} while( assumed != old );

	return old;
	#else // __CUDA_ARCH__ < 600
	return atomicAdd( &v, a );
	#endif //__CUDA_ARCH__ < 600
	}
	#endif // HAVE_CUDA

	__cuda_callable__
	static long int
	add( long int& v, const long int& a )
	{
	#ifdef HAVE_CUDA
	TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
	#endif // HAVE_CUDA
	return 0;		return 0;
			#endif
	}		}

	__cuda_callable__		__cuda_callable__
	@@ -114,9 +77,10 @@ struct AtomicOperations< Devices::Cuda >
	{		{
	#ifdef HAVE_CUDA		#ifdef HAVE_CUDA
	TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );		TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
	#endif // HAVE_CUDA		#endif
	return 0;		return 0;
	}		}
	};		};

	} // namespace Algorithms		} // namespace Algorithms
	} // namespace TNL		} // namespace TNL

src/TNL/Atomic.h

+25 −4

Original line number	Original line	Diff line number	Diff line
	@@ -14,11 +14,12 @@
	#include <TNL/Devices/Sequential.h>		#include <TNL/Devices/Sequential.h>
	#include <TNL/Devices/Cuda.h>		#include <TNL/Devices/Cuda.h>

	// double-precision atomicAdd function for Maxwell and older GPUs
	// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
	#ifdef HAVE_CUDA		#ifdef HAVE_CUDA
	#if __CUDA_ARCH__ < 600
	namespace {		namespace {

			// double-precision atomicAdd function for Maxwell and older GPUs
			// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
			#if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
	__device__		__device__
	double		double
	atomicAdd( double* address, double val )		atomicAdd( double* address, double val )
	@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )

	return __longlong_as_double( old );		return __longlong_as_double( old );
	}		}
	} // namespace
	#endif		#endif

			__device__
			long int
			atomicAdd( long int* address, long int val )
			{
			unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
			long int old = *address;
			long int assumed;

			do {
			assumed = old;
			long int sum = val + assumed;
			old = atomicCAS( address_as_unsigned,
			reinterpret_cast< unsigned long long int >( &assumed ),
			reinterpret_cast< unsigned long long int >( &sum ) );
			} while( assumed != old );

			return old;
			}

			} // namespace
	#endif		#endif

	namespace TNL {		namespace TNL {