Loading src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +1 −3 Original line number Diff line number Diff line Loading @@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation, typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** * Get thread id (tid) and global element id (gid). Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +1 −3 Original line number Diff line number Diff line Loading @@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation, typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** * Get thread id (tid) and global thread id (gid). Loading src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType* output, DataType* auxArray ) { DataType* sharedData = TNL::Devices::getSharedMemory< DataType >(); DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; DataType* warpSums = &auxData[ blockDim. x ]; Loading src/TNL/Devices/Cuda.h +21 −23 Original line number Diff line number Diff line Loading @@ -82,6 +82,27 @@ class Cuda #ifdef HAVE_CUDA template< typename Index > static __device__ Index getInterleaving( const Index index ); /**** * Declaration of variables for dynamic shared memory is difficult in * templated functions. For example, the following does not work for * different types T: * * template< typename T > * void foo() * { * extern __shared__ T shx[]; * } * * This is because extern variables must be declared exactly once. In * templated functions we need to have same variable name with different * type, which causes the conflict. In CUDA samples they solve the problem * using template specialization via classes, but using one base type and * reinterpret_cast works too. * See http://stackoverflow.com/a/19339004/4180822 for reference. */ template< typename Element, size_t Alignment = sizeof( Element ) > static __device__ Element* getSharedMemory(); #endif #ifdef HAVE_CUDA Loading Loading @@ -123,29 +144,6 @@ class Cuda #define CudaSupportMissingMessage \ std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl; // TODO: This would be nice in Cuda but C++ standard does not allow it. #ifdef HAVE_CUDA template< typename Element > struct getSharedMemory { __device__ operator Element*(); }; template<> struct getSharedMemory< double > { inline __device__ operator double*(); }; template<> struct getSharedMemory< long int > { inline __device__ operator long int*(); }; #endif } // namespace Devices } // namespace TNL Loading src/TNL/Devices/Cuda_impl.h +5 −18 Original line number Diff line number Diff line Loading @@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index ) return index + index / Cuda::getNumberOfSharedMemoryBanks(); } template< typename Element > __device__ getSharedMemory< Element >::operator Element*() template< typename Element, size_t Alignment > __device__ Element* Cuda::getSharedMemory() { extern __shared__ int __sharedMemory[]; return ( Element* ) __sharedMemory; }; __device__ inline getSharedMemory< double >::operator double*() { extern __shared__ double __sharedMemoryDouble[]; return ( double* ) __sharedMemoryDouble; }; __device__ inline getSharedMemory< long int >::operator long int*() { extern __shared__ long int __sharedMemoryLongInt[]; return ( long int* ) __sharedMemoryLongInt; }; extern __shared__ __align__ ( Alignment ) unsigned char __sdata[]; return reinterpret_cast< Element* >( __sdata ); } #endif /* HAVE_CUDA */ } // namespace Devices Loading Loading
src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +1 −3 Original line number Diff line number Diff line Loading @@ -50,9 +50,7 @@ CudaMultireductionKernel( Operation operation, typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** * Get thread id (tid) and global element id (gid). Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +1 −3 Original line number Diff line number Diff line Loading @@ -48,9 +48,7 @@ CudaReductionKernel( Operation operation, typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** * Get thread id (tid) and global thread id (gid). Loading
src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType* output, DataType* auxArray ) { DataType* sharedData = TNL::Devices::getSharedMemory< DataType >(); DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; DataType* warpSums = &auxData[ blockDim. x ]; Loading
src/TNL/Devices/Cuda.h +21 −23 Original line number Diff line number Diff line Loading @@ -82,6 +82,27 @@ class Cuda #ifdef HAVE_CUDA template< typename Index > static __device__ Index getInterleaving( const Index index ); /**** * Declaration of variables for dynamic shared memory is difficult in * templated functions. For example, the following does not work for * different types T: * * template< typename T > * void foo() * { * extern __shared__ T shx[]; * } * * This is because extern variables must be declared exactly once. In * templated functions we need to have same variable name with different * type, which causes the conflict. In CUDA samples they solve the problem * using template specialization via classes, but using one base type and * reinterpret_cast works too. * See http://stackoverflow.com/a/19339004/4180822 for reference. */ template< typename Element, size_t Alignment = sizeof( Element ) > static __device__ Element* getSharedMemory(); #endif #ifdef HAVE_CUDA Loading Loading @@ -123,29 +144,6 @@ class Cuda #define CudaSupportMissingMessage \ std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl; // TODO: This would be nice in Cuda but C++ standard does not allow it. #ifdef HAVE_CUDA template< typename Element > struct getSharedMemory { __device__ operator Element*(); }; template<> struct getSharedMemory< double > { inline __device__ operator double*(); }; template<> struct getSharedMemory< long int > { inline __device__ operator long int*(); }; #endif } // namespace Devices } // namespace TNL Loading
src/TNL/Devices/Cuda_impl.h +5 −18 Original line number Diff line number Diff line Loading @@ -140,25 +140,12 @@ __device__ Index Cuda::getInterleaving( const Index index ) return index + index / Cuda::getNumberOfSharedMemoryBanks(); } template< typename Element > __device__ getSharedMemory< Element >::operator Element*() template< typename Element, size_t Alignment > __device__ Element* Cuda::getSharedMemory() { extern __shared__ int __sharedMemory[]; return ( Element* ) __sharedMemory; }; __device__ inline getSharedMemory< double >::operator double*() { extern __shared__ double __sharedMemoryDouble[]; return ( double* ) __sharedMemoryDouble; }; __device__ inline getSharedMemory< long int >::operator long int*() { extern __shared__ long int __sharedMemoryLongInt[]; return ( long int* ) __sharedMemoryLongInt; }; extern __shared__ __align__ ( Alignment ) unsigned char __sdata[]; return reinterpret_cast< Element* >( __sdata ); } #endif /* HAVE_CUDA */ } // namespace Devices Loading