From c815816e4c21b00a00ddfa82a21a3839e768ae1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 18 Nov 2016 19:20:24 +0100
Subject: [PATCH] Added Devices::Cuda::getSharedMemory method

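Replace the getSharedMemory helper classes in the Devices namespace (a
generic template plus explicit specializations for double and long int)
and the ad-hoc

   extern __shared__ __align__ ( 8 ) char __sdata[];

declarations in the reduction and GEMV kernels with a single templated
device method, Devices::Cuda::getSharedMemory< Element, Alignment >(),
where Alignment defaults to sizeof( Element ). Existing callers in the
prefix-sum, CSR and ChunkedEllpack kernels are updated accordingly.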
---
 .../Algorithms/CudaMultireductionKernel.h     |  4 +-
 .../Algorithms/CudaReductionKernel.h          |  4 +-
 .../Algorithms/cuda-prefix-sum_impl.h         |  2 +-
 src/TNL/Devices/Cuda.h                        | 44 +++++++++----------
 src/TNL/Devices/Cuda_impl.h                   | 23 +++-------
 src/TNL/Matrices/CSR_impl.h                   |  2 +-
 src/TNL/Matrices/ChunkedEllpack_impl.h        |  2 +-
 src/TNL/Matrices/MatrixOperations.h           | 11 +----
 8 files changed, 32 insertions(+), 60 deletions(-)

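A minimal usage sketch, not part of this patch: the kernel name, its
arguments and the include path are made up for illustration; only
Devices::Cuda::getSharedMemory itself comes from this change. A templated
kernel obtains its dynamic shared memory buffer from the new method
instead of declaring the extern array itself:

   #include <TNL/Devices/Cuda.h>

   template< typename Real >
   __global__ void exampleCopyKernel( const Real* input, Real* output )
   {
      // One element of dynamic shared memory per thread; the buffer size
      // is set by the third kernel launch parameter.
      Real* shm = TNL::Devices::Cuda::getSharedMemory< Real >();
      const int gid = blockIdx.x * blockDim.x + threadIdx.x;
      shm[ threadIdx.x ] = input[ gid ];
      __syncthreads();
      output[ gid ] = shm[ threadIdx.x ];
   }

   // host side: exampleCopyKernel<<< blocks, threads, threads * sizeof( Real ) >>>( in, out );

The second template parameter controls the alignment of the underlying
buffer and defaults to sizeof( Element ), so kernels that place several
types into one dynamic shared memory block (as in cuda-prefix-sum_impl.h)
can request a stricter alignment explicitly if needed.
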
diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
index e1dd4c36ef..ae0ee698ca 100644
--- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
@@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation,
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::ResultType ResultType;
 
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
     * Get thread id (tid) and global element id (gid).
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 607779eff4..c6d7d49eca 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation,
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::ResultType ResultType;
 
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
     * Get thread id (tid) and global thread id (gid).
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
index d0591050d5..971544a024 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
@@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
                                               DataType* output,
                                               DataType* auxArray )
 {
-   DataType* sharedData = TNL::Devices::getSharedMemory< DataType >();
+   DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
    DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
    DataType* warpSums = &auxData[ blockDim. x ];
 
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 301b009163..3a8f6a3dd5 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -82,6 +82,27 @@ class Cuda
 #ifdef HAVE_CUDA
    template< typename Index >
    static __device__ Index getInterleaving( const Index index );
+
+   /****
+    * Declaration of variables for dynamic shared memory is difficult in
+    * templated functions. For example, the following does not work for
+    * different types T:
+    *
+    *    template< typename T >
+    *    void foo()
+    *    {
+    *        extern __shared__ T shx[];
+    *    }
+    *
+    * This is because extern variables must be declared exactly once. In
+    * templated functions we would need the same variable name with a
+    * different type, which causes a conflict. The CUDA samples solve the
+    * problem with template specialization via classes, but using a single
+    * base type and reinterpret_cast works as well.
+    * See http://stackoverflow.com/a/19339004/4180822 for reference.
+    */
+   template< typename Element, size_t Alignment = sizeof( Element ) >
+   static __device__ Element* getSharedMemory();
 #endif
 
 #ifdef HAVE_CUDA
@@ -123,29 +144,6 @@ class Cuda
 #define CudaSupportMissingMessage \
    std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;
 
-
-// TODO: This would be nice in Cuda but C++ standard does not allow it.
-#ifdef HAVE_CUDA
-   template< typename Element >
-   struct getSharedMemory
-   {
-       __device__ operator Element*();
-   };
-
-   template<>
-   struct getSharedMemory< double >
-   {
-       inline __device__ operator double*();
-   };
-
-   template<>
-   struct getSharedMemory< long int >
-   {
-       inline __device__ operator long int*();
-   };
-
-#endif
-
 } // namespace Devices
 } // namespace TNL   
    
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 9c0d252a5a..580653b708 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index )
    return index + index / Cuda::getNumberOfSharedMemoryBanks();
 }
 
-template< typename Element >
-__device__ getSharedMemory< Element >::operator Element*()
+template< typename Element, size_t Alignment >
+__device__ Element* Cuda::getSharedMemory()
 {
-   extern __shared__ int __sharedMemory[];
-   return ( Element* ) __sharedMemory;
-};
-
-__device__ inline getSharedMemory< double >::operator double*()
-{
-   extern __shared__ double __sharedMemoryDouble[];
-   return ( double* ) __sharedMemoryDouble;
-};
-
-__device__ inline getSharedMemory< long int >::operator long int*()
-{
-   extern __shared__ long int __sharedMemoryLongInt[];
-   return ( long int* ) __sharedMemoryLongInt;
-};
-
+   extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
+   return reinterpret_cast< Element* >( __sdata );
+}
 #endif /* HAVE_CUDA */
 
 } // namespace Devices
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 0d7261a573..d32355c732 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -629,7 +629,7 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
                                                               const IndexType warpEnd,
                                                               const IndexType inWarpIdx ) const
 {
-   volatile Real* aux = Devices::getSharedMemory< Real >();
+   volatile Real* aux = Devices::Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 325ae861ab..aed5b5eded 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1068,7 +1068,7 @@ __device__ void ChunkedEllpack< Real, Device, Index >::computeSliceVectorProduct
 {
    static_assert( std::is_same < DeviceType, Devices::Cuda >::value, "" );
 
-   RealType* chunkProducts = Devices::getSharedMemory< RealType >();
+   RealType* chunkProducts = Devices::Cuda::getSharedMemory< RealType >();
    ChunkedEllpackSliceInfo* sliceInfo = ( ChunkedEllpackSliceInfo* ) & chunkProducts[ blockDim.x ];
 
    if( threadIdx.x == 0 )
diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h
index 4cecc84259..f6d6decf1d 100644
--- a/src/TNL/Matrices/MatrixOperations.h
+++ b/src/TNL/Matrices/MatrixOperations.h
@@ -90,16 +90,7 @@ GemvCudaKernel( const IndexType m,
    IndexType elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const IndexType gridSize = blockDim.x * gridDim.x;
 
-   // NOTE: Plain declaration such as
-   //          extern __shared__ RealType shx[];
-   //       won't work because extern variables must be declared exactly once.
-   //       In templated functions we need to have same variable name with
-   //       different type, which causes the conflict. In CUDA samples they
-   //       solve it using template specialization via classes, but using char
-   //       as the base type and reinterpret_cast works too.
-   //       See http://stackoverflow.com/a/19339004/4180822
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   RealType* shx = reinterpret_cast< RealType* >( __sdata );
+   RealType* shx = Devices::Cuda::getSharedMemory< RealType >();
 
    if( threadIdx.x < n )
       shx[ threadIdx.x ] = x[ threadIdx.x ];
-- 
GitLab