Commit c815816e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Added Devices::Cuda::getSharedMemory method

parent 85212904
Loading
Loading
Loading
Loading
+1 −3
Original line number Diff line number Diff line
@@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation,
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::ResultType ResultType;

   extern __shared__ __align__ ( 8 ) char __sdata[];

   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

   /***
    * Get thread id (tid) and global element id (gid).
+1 −3
Original line number Diff line number Diff line
@@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation,
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::ResultType ResultType;

   extern __shared__ __align__ ( 8 ) char __sdata[];

   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

   /***
    * Get thread id (tid) and global thread id (gid).
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
                                              DataType* output,
                                              DataType* auxArray )
{
   DataType* sharedData = TNL::Devices::getSharedMemory< DataType >();
   DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
   DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
   DataType* warpSums = &auxData[ blockDim. x ];

+21 −23
Original line number Diff line number Diff line
@@ -82,6 +82,27 @@ class Cuda
#ifdef HAVE_CUDA
   template< typename Index >
   static __device__ Index getInterleaving( const Index index );

   /****
    * Declaration of variables for dynamic shared memory is difficult in
    * templated functions. For example, the following does not work for
    * different types T:
    *
    *    template< typename T >
    *    void foo()
    *    {
    *        extern __shared__ T shx[];
    *    }
    *
    * This is because extern variables must be declared exactly once. In
    * templated functions we need to have same variable name with different
    * type, which causes the conflict. In CUDA samples they solve the problem
    * using template specialization via classes, but using one base type and
    * reinterpret_cast works too.
    * See http://stackoverflow.com/a/19339004/4180822 for reference.
    */
   template< typename Element, size_t Alignment = sizeof( Element ) >
   static __device__ Element* getSharedMemory();
#endif

#ifdef HAVE_CUDA
@@ -123,29 +144,6 @@ class Cuda
#define CudaSupportMissingMessage \
   std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;


// TODO: This would be nice in Cuda but C++ standard does not allow it.
#ifdef HAVE_CUDA
   template< typename Element >
   struct getSharedMemory
   {
       __device__ operator Element*();
   };

   template<>
   struct getSharedMemory< double >
   {
       inline __device__ operator double*();
   };

   template<>
   struct getSharedMemory< long int >
   {
       inline __device__ operator long int*();
   };

#endif

} // namespace Devices
} // namespace TNL   
   
+5 −18
Original line number Diff line number Diff line
@@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index )
   return index + index / Cuda::getNumberOfSharedMemoryBanks();
}

template< typename Element >
__device__ getSharedMemory< Element >::operator Element*()
template< typename Element, size_t Alignment >
__device__ Element* Cuda::getSharedMemory()
{
   extern __shared__ int __sharedMemory[];
   return ( Element* ) __sharedMemory;
};

__device__ inline getSharedMemory< double >::operator double*()
{
   extern __shared__ double __sharedMemoryDouble[];
   return ( double* ) __sharedMemoryDouble;
};

__device__ inline getSharedMemory< long int >::operator long int*()
{
   extern __shared__ long int __sharedMemoryLongInt[];
   return ( long int* ) __sharedMemoryLongInt;
};

   extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
   return reinterpret_cast< Element* >( __sdata );
}
#endif /* HAVE_CUDA */

} // namespace Devices
Loading