Commit 2b6cb697 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Fixed Cuda::getSharedMemory for CUDA 9

parent 54fdf757
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@ SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL )
set( headers 
     Assert.h
     Constants.h
     CudaSharedMemory.h
     CudaStreamPool.h
     Curve.h
     DevicePointer.h
+117 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CudaSharedMemory.h  -  description
                             -------------------
    begin                : Oct 18, 2017
    copyright            : (C) 2017 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

/*
 * Declaration of variables for dynamic shared memory is difficult in templated
 * functions. For example, the following does not work for different types T:
 *
 *    template< typename T >
 *    void foo()
 *    {
 *        extern __shared__ T shx[];
 *    }
 *
 * This is because extern variables must be declared exactly once. In templated
 * functions we need to have same variable name with different type, which
 * causes the conflict.
 *
 * Until CUDA 8.0, it was possible to use reinterpret_cast this way:
 *
 *    template< typename Element, size_t Alignment >
 *    __device__ Element* Cuda::getSharedMemory()
 *    {
 *       extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
 *       return reinterpret_cast< Element* >( __sdata );
 *    }
 *
 * But since CUDA 9.0 there is a new restriction that the alignment of the
 * extern variable must be the same in all template instances. Therefore we
 * follow the idea introduced in the CUDA samples, where the problem is solved
 * using template class specializations.
 */

#ifdef HAVE_CUDA

#include <stdint.h>

namespace TNL {

template< typename T, std::size_t _alignment = CHAR_BIT * sizeof(T) >
struct CudaSharedMemory {};

template< typename T >
struct CudaSharedMemory< T, 8 >
{
   __device__ inline operator T* ()
   {
      extern __shared__ uint8_t __smem8[];
      return reinterpret_cast< T* >( __smem8 );
   }

   __device__ inline operator const T* () const
   {
      extern __shared__ uint8_t __smem8[];
      return reinterpret_cast< T* >( __smem8 );
   }
};

template< typename T >
struct CudaSharedMemory< T, 16 >
{
   __device__ inline operator T* ()
   {
      extern __shared__ uint16_t __smem16[];
      return reinterpret_cast< T* >( __smem16 );
   }

   __device__ inline operator const T* () const
   {
      extern __shared__ uint16_t __smem16[];
      return reinterpret_cast< T* >( __smem16 );
   }
};

template< typename T >
struct CudaSharedMemory< T, 32 >
{
   __device__ inline operator T* ()
   {
      extern __shared__ uint32_t __smem32[];
      return reinterpret_cast< T* >( __smem32 );
   }

   __device__ inline operator const T* () const
   {
      extern __shared__ uint32_t __smem32[];
      return reinterpret_cast< T* >( __smem32 );
   }
};

template< typename T >
struct CudaSharedMemory< T, 64 >
{
   __device__ inline operator T* ()
   {
      extern __shared__ uint64_t __smem64[];
      return reinterpret_cast< T* >( __smem64 );
   }

   __device__ inline operator const T* () const
   {
      extern __shared__ uint64_t __smem64[];
      return reinterpret_cast< T* >( __smem64 );
   }
};

} // namespace TNL

#endif
+1 −1
Original line number Diff line number Diff line
@@ -143,7 +143,7 @@ class Cuda
    * reinterpret_cast works too.
    * See http://stackoverflow.com/a/19339004/4180822 for reference.
    */
   template< typename Element, size_t Alignment = sizeof( Element ) >
   template< typename Element >
   static __device__ Element* getSharedMemory();
#endif

+3 −3
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/CudaSharedMemory.h>

namespace TNL {
namespace Devices {   
@@ -157,11 +158,10 @@ __device__ Index Cuda::getInterleaving( const Index index )
   return index + index / Cuda::getNumberOfSharedMemoryBanks();
}

template< typename Element, size_t Alignment >
template< typename Element >
__device__ Element* Cuda::getSharedMemory()
{
   extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
   return reinterpret_cast< Element* >( __sdata );
   return CudaSharedMemory< Element >();
}
#endif /* HAVE_CUDA */