From c815816e4c21b00a00ddfa82a21a3839e768ae1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Fri, 18 Nov 2016 19:20:24 +0100
Subject: [PATCH] Added Devices::Cuda::getSharedMemory method

---
 .../Algorithms/CudaMultireductionKernel.h |  4 +-
 .../Algorithms/CudaReductionKernel.h      |  4 +-
 .../Algorithms/cuda-prefix-sum_impl.h     |  2 +-
 src/TNL/Devices/Cuda.h                    | 44 +++++++++----------
 src/TNL/Devices/Cuda_impl.h               | 23 +++-------
 src/TNL/Matrices/CSR_impl.h               |  2 +-
 src/TNL/Matrices/ChunkedEllpack_impl.h    |  2 +-
 src/TNL/Matrices/MatrixOperations.h       | 11 +----
 8 files changed, 32 insertions(+), 60 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
index e1dd4c36ef..ae0ee698ca 100644
--- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
@@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation,
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::ResultType ResultType;
 
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
    * Get thread id (tid) and global element id (gid).
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 607779eff4..c6d7d49eca 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation,
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::ResultType ResultType;
 
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
    * Get thread id (tid) and global thread id (gid).
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
index d0591050d5..971544a024 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
@@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
                                               DataType* output,
                                               DataType* auxArray )
 {
-   DataType* sharedData = TNL::Devices::getSharedMemory< DataType >();
+   DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
    DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
    DataType* warpSums = &auxData[ blockDim. x ];
 
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 301b009163..3a8f6a3dd5 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -82,6 +82,27 @@ class Cuda
 #ifdef HAVE_CUDA
    template< typename Index >
    static __device__ Index getInterleaving( const Index index );
+
+   /****
+    * Declaration of variables for dynamic shared memory is difficult in
+    * templated functions. For example, the following does not work for
+    * different types T:
+    *
+    *    template< typename T >
+    *    void foo()
+    *    {
+    *        extern __shared__ T shx[];
+    *    }
+    *
+    * This is because extern variables must be declared exactly once. In
+    * templated functions we need to have the same variable name with a
+    * different type, which causes a conflict. The CUDA samples solve the
+    * problem using template specialization via classes, but using one
+    * base type and reinterpret_cast works too.
+    * See http://stackoverflow.com/a/19339004/4180822 for reference.
+    */
+   template< typename Element, size_t Alignment = sizeof( Element ) >
+   static __device__ Element* getSharedMemory();
 #endif
 
 #ifdef HAVE_CUDA
@@ -123,29 +144,6 @@ class Cuda
 #define CudaSupportMissingMessage \
    std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;
 
-
-// TODO: This would be nice in Cuda but C++ standard does not allow it.
-#ifdef HAVE_CUDA
-   template< typename Element >
-   struct getSharedMemory
-   {
-      __device__ operator Element*();
-   };
-
-   template<>
-   struct getSharedMemory< double >
-   {
-      inline __device__ operator double*();
-   };
-
-   template<>
-   struct getSharedMemory< long int >
-   {
-      inline __device__ operator long int*();
-   };
-
-#endif
-
 
 } // namespace Devices
 } // namespace TNL
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 9c0d252a5a..580653b708 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index )
    return index + index / Cuda::getNumberOfSharedMemoryBanks();
 }
 
-template< typename Element >
-__device__ getSharedMemory< Element >::operator Element*()
+template< typename Element, size_t Alignment >
+__device__ Element* Cuda::getSharedMemory()
 {
-   extern __shared__ int __sharedMemory[];
-   return ( Element* ) __sharedMemory;
-};
-
-__device__ inline getSharedMemory< double >::operator double*()
-{
-   extern __shared__ double __sharedMemoryDouble[];
-   return ( double* ) __sharedMemoryDouble;
-};
-
-__device__ inline getSharedMemory< long int >::operator long int*()
-{
-   extern __shared__ long int __sharedMemoryLongInt[];
-   return ( long int* ) __sharedMemoryLongInt;
-};
-
+   extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
+   return reinterpret_cast< Element* >( __sdata );
+}
 #endif /* HAVE_CUDA */
 
 } // namespace Devices
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 0d7261a573..d32355c732 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -629,7 +629,7 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
                                                      const IndexType warpEnd,
                                                      const IndexType inWarpIdx ) const
 {
-   volatile Real* aux = Devices::getSharedMemory< Real >();
+   volatile Real* aux = Devices::Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 325ae861ab..aed5b5eded 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1068,7 +1068,7 @@ __device__ void ChunkedEllpack< Real, Device, Index >::computeSliceVectorProduct
 {
    static_assert( std::is_same < DeviceType, Devices::Cuda >::value, "" );
 
-   RealType* chunkProducts = Devices::getSharedMemory< RealType >();
+   RealType* chunkProducts = Devices::Cuda::getSharedMemory< RealType >();
    ChunkedEllpackSliceInfo* sliceInfo = ( ChunkedEllpackSliceInfo* ) & chunkProducts[ blockDim.x ];
 
    if( threadIdx.x == 0 )
diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h
index 4cecc84259..f6d6decf1d 100644
--- a/src/TNL/Matrices/MatrixOperations.h
+++ b/src/TNL/Matrices/MatrixOperations.h
@@ -90,16 +90,7 @@ GemvCudaKernel( const IndexType m,
    IndexType elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const IndexType gridSize = blockDim.x * gridDim.x;
 
-   // NOTE: Plain declaration such as
-   //    extern __shared__ RealType shx[];
-   // won't work because extern variables must be declared exactly once.
-   // In templated functions we need to have same variable name with
-   // different type, which causes the conflict. In CUDA samples they
-   // solve it using template specialization via classes, but using char
-   // as the base type and reinterpret_cast works too.
-   // See http://stackoverflow.com/a/19339004/4180822
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   RealType* shx = reinterpret_cast< RealType* >( __sdata );
+   RealType* shx = Devices::Cuda::getSharedMemory< RealType >();
 
    if( threadIdx.x < n )
       shx[ threadIdx.x ] = x[ threadIdx.x ];
-- 
GitLab
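
A minimal usage sketch (not part of the patch above), showing how a templated kernel can type its dynamic shared memory through the new Devices::Cuda::getSharedMemory helper. The kernel, variable names, and the launch line are hypothetical examples, not TNL code; only the getSharedMemory< Real >() call reflects the API added by this patch. Note that the dynamic shared-memory size must still be passed as the third kernel launch parameter, exactly as with a plain extern __shared__ declaration.

   #include <TNL/Devices/Cuda.h>

   // Hypothetical kernel: stages x in dynamic shared memory typed as Real.
   template< typename Real, typename Index >
   __global__ void copyViaSharedKernel( const Real* x, Real* y, Index n )
   {
      // One dynamic shared-memory buffer per block; its size is given by the
      // third argument of the kernel launch configuration.
      Real* shx = TNL::Devices::Cuda::getSharedMemory< Real >();

      const Index i = blockIdx.x * blockDim.x + threadIdx.x;
      if( i < n ) {
         shx[ threadIdx.x ] = x[ i ];
         y[ i ] = shx[ threadIdx.x ];
      }
   }

   // Host-side launch (assumed grid/block variables): the third <<< >>> parameter
   // is the dynamic shared-memory size in bytes, one Real per thread here.
   // copyViaSharedKernel<<< blocks, threads, threads * sizeof( float ) >>>( d_x, d_y, n );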