Fixed Cuda::getSharedMemory for CUDA 9 (2b6cb697) · Commits · TNL / tnl-dev

src/TNL/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,7 @@ SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL )
		set( headers
		Assert.h
		Constants.h
		CudaSharedMemory.h
		CudaStreamPool.h
		Curve.h
		DevicePointer.h

src/TNL/CudaSharedMemory.h

0 → 100644

+117 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CudaSharedMemory.h - description
		-------------------
		begin : Oct 18, 2017
		copyright : (C) 2017 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		/*
		* Declaration of variables for dynamic shared memory is difficult in templated
		* functions. For example, the following does not work for different types T:
		*
		* template< typename T >
		* void foo()
		* {
		* extern __shared__ T shx[];
		* }
		*
		* This is because extern variables must be declared exactly once. In templated
		* functions we need to have same variable name with different type, which
		* causes the conflict.
		*
		* Until CUDA 8.0, it was possible to use reinterpret_cast this way:
		*
		* template< typename Element, size_t Alignment >
		* __device__ Element* Cuda::getSharedMemory()
		* {
		* extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
		* return reinterpret_cast< Element* >( __sdata );
		* }
		*
		* But since CUDA 9.0 there is a new restriction that the alignment of the
		* extern variable must be the same in all template instances. Therefore we
		* follow the idea introduced in the CUDA samples, where the problem is solved
		* using template class specializations.
		*/

		#ifdef HAVE_CUDA

		#include <stdint.h>

		namespace TNL {

		template< typename T, std::size_t _alignment = CHAR_BIT * sizeof(T) >
		struct CudaSharedMemory {};

		template< typename T >
		struct CudaSharedMemory< T, 8 >
		{
		__device__ inline operator T* ()
		{
		extern __shared__ uint8_t __smem8[];
		return reinterpret_cast< T* >( __smem8 );
		}

		__device__ inline operator const T* () const
		{
		extern __shared__ uint8_t __smem8[];
		return reinterpret_cast< T* >( __smem8 );
		}
		};

		template< typename T >
		struct CudaSharedMemory< T, 16 >
		{
		__device__ inline operator T* ()
		{
		extern __shared__ uint16_t __smem16[];
		return reinterpret_cast< T* >( __smem16 );
		}

		__device__ inline operator const T* () const
		{
		extern __shared__ uint16_t __smem16[];
		return reinterpret_cast< T* >( __smem16 );
		}
		};

		template< typename T >
		struct CudaSharedMemory< T, 32 >
		{
		__device__ inline operator T* ()
		{
		extern __shared__ uint32_t __smem32[];
		return reinterpret_cast< T* >( __smem32 );
		}

		__device__ inline operator const T* () const
		{
		extern __shared__ uint32_t __smem32[];
		return reinterpret_cast< T* >( __smem32 );
		}
		};

		template< typename T >
		struct CudaSharedMemory< T, 64 >
		{
		__device__ inline operator T* ()
		{
		extern __shared__ uint64_t __smem64[];
		return reinterpret_cast< T* >( __smem64 );
		}

		__device__ inline operator const T* () const
		{
		extern __shared__ uint64_t __smem64[];
		return reinterpret_cast< T* >( __smem64 );
		}
		};

		} // namespace TNL

		#endif

src/TNL/Devices/Cuda.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -143,7 +143,7 @@ class Cuda
		* reinterpret_cast works too.
		* See http://stackoverflow.com/a/19339004/4180822 for reference.
		*/
		template< typename Element, size_t Alignment = sizeof( Element ) >
		template< typename Element >
		static __device__ Element* getSharedMemory();
		#endif

src/TNL/Devices/Cuda_impl.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -13,6 +13,7 @@
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Exceptions/CudaBadAlloc.h>
		#include <TNL/Exceptions/CudaSupportMissing.h>
		#include <TNL/CudaSharedMemory.h>

		namespace TNL {
		namespace Devices {
		@@ -157,11 +158,10 @@ __device__ Index Cuda::getInterleaving( const Index index )
		return index + index / Cuda::getNumberOfSharedMemoryBanks();
		}

		template< typename Element, size_t Alignment >
		template< typename Element >
		__device__ Element* Cuda::getSharedMemory()
		{
		extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
		return reinterpret_cast< Element* >( __sdata );
		return CudaSharedMemory< Element >();
		}
		#endif /* HAVE_CUDA */