Added Devices::Cuda::getSharedMemory method (c815816e) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation,
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::ResultType ResultType;

		extern __shared__ __align__ ( 8 ) char __sdata[];

		ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
		ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

		/***
		* Get thread id (tid) and global element id (gid).

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+1 −3

Original line number	Diff line number	Diff line
		@@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation,
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::ResultType ResultType;

		extern __shared__ __align__ ( 8 ) char __sdata[];

		ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
		ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

		/***
		* Get thread id (tid) and global thread id (gid).

src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		DataType* output,
		DataType* auxArray )
		{
		DataType* sharedData = TNL::Devices::getSharedMemory< DataType >();
		DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
		DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
		DataType* warpSums = &auxData[ blockDim. x ];

src/TNL/Devices/Cuda.h

+21 −23

Original line number	Diff line number	Diff line
		@@ -82,6 +82,27 @@ class Cuda
		#ifdef HAVE_CUDA
		template< typename Index >
		static __device__ Index getInterleaving( const Index index );

		/****
		* Declaration of variables for dynamic shared memory is difficult in
		* templated functions. For example, the following does not work for
		* different types T:
		*
		* template< typename T >
		* void foo()
		* {
		* extern __shared__ T shx[];
		* }
		*
		* This is because extern variables must be declared exactly once. In
		* templated functions we need to have same variable name with different
		* type, which causes the conflict. In CUDA samples they solve the problem
		* using template specialization via classes, but using one base type and
		* reinterpret_cast works too.
		* See http://stackoverflow.com/a/19339004/4180822 for reference.
		*/
		template< typename Element, size_t Alignment = sizeof( Element ) >
		static __device__ Element* getSharedMemory();
		#endif

		#ifdef HAVE_CUDA
		@@ -123,29 +144,6 @@ class Cuda
		#define CudaSupportMissingMessage \
		std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;


		// TODO: This would be nice in Cuda but C++ standard does not allow it.
		#ifdef HAVE_CUDA
		template< typename Element >
		struct getSharedMemory
		{
		__device__ operator Element*();
		};

		template<>
		struct getSharedMemory< double >
		{
		inline __device__ operator double*();
		};

		template<>
		struct getSharedMemory< long int >
		{
		inline __device__ operator long int*();
		};

		#endif

		} // namespace Devices
		} // namespace TNL

src/TNL/Devices/Cuda_impl.h

+5 −18

Original line number	Diff line number	Diff line
		@@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index )
		return index + index / Cuda::getNumberOfSharedMemoryBanks();
		}

		template< typename Element >
		__device__ getSharedMemory< Element >::operator Element*()
		template< typename Element, size_t Alignment >
		__device__ Element* Cuda::getSharedMemory()
		{
		extern __shared__ int __sharedMemory[];
		return ( Element* ) __sharedMemory;
		};

		__device__ inline getSharedMemory< double >::operator double*()
		{
		extern __shared__ double __sharedMemoryDouble[];
		return ( double* ) __sharedMemoryDouble;
		};

		__device__ inline getSharedMemory< long int >::operator long int*()
		{
		extern __shared__ long int __sharedMemoryLongInt[];
		return ( long int* ) __sharedMemoryLongInt;
		};

		extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
		return reinterpret_cast< Element* >( __sdata );
		}
		#endif /* HAVE_CUDA */

		} // namespace Devices