Loading install +1 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ TEMPLATE_EXPLICIT_INSTANTIATION=yes CMAKE="cmake" CPUS=`grep -c processor /proc/cpuinfo` CPUS="1" echo "Building $TARGET using $CPUS processors." if [ ! -d Debug ]; Loading src/core/cuda/cuda-prefix-sum.h +42 −0 Original line number Diff line number Diff line Loading @@ -21,6 +21,46 @@ enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; template< typename DataType > class operationSum { public: DataType identity() const { return ( DataType ) 0.0; }; void performInPlace( DataType& a, const DataType& b ) const { a += b; }; DataType perform( const DataType& a, const DataType& b ) { return a + b; }; #ifdef HAVE_CUDA __device__ DataType cudaIdentity() const { return ( DataType ) 0.0; }; __device__ void cudaPerformInPlace( DataType& a, const DataType& b ) const { a += b; }; __device__ DataType cudaPerform( const DataType& a, const DataType& b ) { return a + b; }; #endif }; template< typename DataType, template< typename T > class Operation, typename Index > Loading @@ -31,4 +71,6 @@ bool cudaPrefixSum( const Index size, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); #include <implementation/core/cuda/cuda-prefix-sum_impl.h> #endif /* CUDA_PREFIX_SUM_H_ */ src/core/tnlCuda.h +37 −2 Original line number Diff line number Diff line Loading @@ -42,10 +42,20 @@ class tnlCuda static int getGPUTransferBufferSize(); static bool checkDevice( const char* file_name, int line ); static size_t getFreeMemory(); #ifdef HAVE_CUDA static inline __host__ __device__ int getNumberOfSharedMemoryBanks(); static inline __host__ __device__ int getWarpSize(); template< typename Index > static __device__ Index getInterleaving( const Index index ); #endif static bool checkDevice( const char* file_name, int line ); protected: static int maxGridSize, maxBlockSize; Loading @@ -56,4 +66,29 @@ class tnlCuda #define tnlCudaSupportMissingMessage \ std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl; // TODO: This would be nice in tnlCuda but C++ standard does not allow it. #ifdef HAVE_CUDA template< typename Element > struct getSharedMemory { __device__ operator Element*(); }; template<> struct getSharedMemory< double > { inline __device__ operator double*(); }; template<> struct getSharedMemory< long int > { inline __device__ operator long int*(); }; #endif #include <implementation/core/tnlCuda_impl.h> #endif /* TNLCUDA_H_ */ src/core/tnlFile.h +0 −2 Original line number Diff line number Diff line Loading @@ -154,8 +154,6 @@ bool tnlFile :: read( Type* buffer, cerr << "File " << fileName << " was not opened for reading. " << endl; return false; } int bytesRead( 0 ); this->readElements = 0; const Index host_buffer_size = :: Min( ( Index ) ( tnlFileGPUvsCPUTransferBufferSize / sizeof( Type ) ), elements ); Loading src/implementation/core/cuda/cuda-prefix-sum_impl.h +26 −21 Original line number Diff line number Diff line Loading @@ -18,6 +18,8 @@ #ifndef CUDA_PREFIX_SUM_IMPL_H_ #define CUDA_PREFIX_SUM_IMPL_H_ #ifdef HAVE_CUDA template< typename DataType, template< typename T > class Operation, typename Index > Loading @@ -28,8 +30,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType* output, DataType* auxArray ) { DataType* sharedData = sharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / shmBanks + 2 ]; DataType* sharedData = getSharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2 ]; DataType* warpSums = &auxData[ blockDim. x ]; Operation< DataType > operation; Loading @@ -45,17 +47,17 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( prefixSumType == exclusivePrefixSum ) { if( idx == 0 ) sharedData[ interleave< shmBanks >( 0 ) ] = operation. cudaIdentity(); sharedData[ 0 ] = operation. cudaIdentity(); while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ interleave< shmBanks >( idx + 1 ) ] = input[ blockOffset + idx ]; sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; idx += blockDim. x; } } else while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ interleave< shmBanks >( idx ) ] = input[ blockOffset + idx ]; sharedData[ tnlCuda::getInterleaving( idx ) ] = input[ blockOffset + idx ]; idx += blockDim. x; } Loading @@ -71,30 +73,30 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( chunkOffset < lastElementInBlock ) { auxData[ threadIdx. x ] = sharedData[ interleave< shmBanks >( chunkOffset ) ]; sharedData[ tnlCuda::getInterleaving( chunkOffset ) ]; } Index chunkPointer( 1 ); while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ], sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer - 1 ) ] ); operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ]; sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer ++; } /*** * Perform the parallel prefix-sum inside warps. */ const int threadInWarpIdx = threadIdx. x % warpSize; const int warpIdx = threadIdx. x / warpSize; for( int stride = 1; stride < warpSize; stride *= 2 ) const int threadInWarpIdx = threadIdx. x % tnlCuda::getWarpSize(); const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == warpSize - 1 ) if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; __syncthreads(); Loading @@ -102,7 +104,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) for( int stride = 1; stride < warpSize; stride *= 2 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); Loading @@ -124,8 +126,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT Index chunkShift( operation. cudaIdentity() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ interleave< shmBanks >( idx ) ]; operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } __syncthreads(); Loading @@ -134,10 +136,10 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation. cudaPerform( sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ], sharedData[ interleave< shmBanks >( lastElementInBlock ) ] ); operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); else auxArray[ blockIdx. x ] = sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ]; auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } } Loading Loading @@ -202,8 +204,9 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, /**** * Run the kernel. */ size_t sharedDataSize = elementsInBlock + elementsInBlock / shmBanks + 2; size_t sharedMemory = ( sharedDataSize + blockSize + warpSize ) * sizeof( DataType ); size_t sharedDataSize = elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2; size_t sharedMemory = ( sharedDataSize + blockSize + tnlCuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType, Loading Loading @@ -332,4 +335,6 @@ bool cudaPrefixSum( const Index size, return true; } #endif #endif /* CUDA_PREFIX_SUM_IMPL_H_ */ Loading
install +1 −0 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ TEMPLATE_EXPLICIT_INSTANTIATION=yes CMAKE="cmake" CPUS=`grep -c processor /proc/cpuinfo` CPUS="1" echo "Building $TARGET using $CPUS processors." if [ ! -d Debug ]; Loading
src/core/cuda/cuda-prefix-sum.h +42 −0 Original line number Diff line number Diff line Loading @@ -21,6 +21,46 @@ enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; template< typename DataType > class operationSum { public: DataType identity() const { return ( DataType ) 0.0; }; void performInPlace( DataType& a, const DataType& b ) const { a += b; }; DataType perform( const DataType& a, const DataType& b ) { return a + b; }; #ifdef HAVE_CUDA __device__ DataType cudaIdentity() const { return ( DataType ) 0.0; }; __device__ void cudaPerformInPlace( DataType& a, const DataType& b ) const { a += b; }; __device__ DataType cudaPerform( const DataType& a, const DataType& b ) { return a + b; }; #endif }; template< typename DataType, template< typename T > class Operation, typename Index > Loading @@ -31,4 +71,6 @@ bool cudaPrefixSum( const Index size, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); #include <implementation/core/cuda/cuda-prefix-sum_impl.h> #endif /* CUDA_PREFIX_SUM_H_ */
src/core/tnlCuda.h +37 −2 Original line number Diff line number Diff line Loading @@ -42,10 +42,20 @@ class tnlCuda static int getGPUTransferBufferSize(); static bool checkDevice( const char* file_name, int line ); static size_t getFreeMemory(); #ifdef HAVE_CUDA static inline __host__ __device__ int getNumberOfSharedMemoryBanks(); static inline __host__ __device__ int getWarpSize(); template< typename Index > static __device__ Index getInterleaving( const Index index ); #endif static bool checkDevice( const char* file_name, int line ); protected: static int maxGridSize, maxBlockSize; Loading @@ -56,4 +66,29 @@ class tnlCuda #define tnlCudaSupportMissingMessage \ std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl; // TODO: This would be nice in tnlCuda but C++ standard does not allow it. #ifdef HAVE_CUDA template< typename Element > struct getSharedMemory { __device__ operator Element*(); }; template<> struct getSharedMemory< double > { inline __device__ operator double*(); }; template<> struct getSharedMemory< long int > { inline __device__ operator long int*(); }; #endif #include <implementation/core/tnlCuda_impl.h> #endif /* TNLCUDA_H_ */
src/core/tnlFile.h +0 −2 Original line number Diff line number Diff line Loading @@ -154,8 +154,6 @@ bool tnlFile :: read( Type* buffer, cerr << "File " << fileName << " was not opened for reading. " << endl; return false; } int bytesRead( 0 ); this->readElements = 0; const Index host_buffer_size = :: Min( ( Index ) ( tnlFileGPUvsCPUTransferBufferSize / sizeof( Type ) ), elements ); Loading
src/implementation/core/cuda/cuda-prefix-sum_impl.h +26 −21 Original line number Diff line number Diff line Loading @@ -18,6 +18,8 @@ #ifndef CUDA_PREFIX_SUM_IMPL_H_ #define CUDA_PREFIX_SUM_IMPL_H_ #ifdef HAVE_CUDA template< typename DataType, template< typename T > class Operation, typename Index > Loading @@ -28,8 +30,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT DataType* output, DataType* auxArray ) { DataType* sharedData = sharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / shmBanks + 2 ]; DataType* sharedData = getSharedMemory< DataType >(); DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2 ]; DataType* warpSums = &auxData[ blockDim. x ]; Operation< DataType > operation; Loading @@ -45,17 +47,17 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( prefixSumType == exclusivePrefixSum ) { if( idx == 0 ) sharedData[ interleave< shmBanks >( 0 ) ] = operation. cudaIdentity(); sharedData[ 0 ] = operation. cudaIdentity(); while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ interleave< shmBanks >( idx + 1 ) ] = input[ blockOffset + idx ]; sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; idx += blockDim. x; } } else while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ interleave< shmBanks >( idx ) ] = input[ blockOffset + idx ]; sharedData[ tnlCuda::getInterleaving( idx ) ] = input[ blockOffset + idx ]; idx += blockDim. x; } Loading @@ -71,30 +73,30 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( chunkOffset < lastElementInBlock ) { auxData[ threadIdx. x ] = sharedData[ interleave< shmBanks >( chunkOffset ) ]; sharedData[ tnlCuda::getInterleaving( chunkOffset ) ]; } Index chunkPointer( 1 ); while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ], sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer - 1 ) ] ); operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ]; sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer ++; } /*** * Perform the parallel prefix-sum inside warps. */ const int threadInWarpIdx = threadIdx. x % warpSize; const int warpIdx = threadIdx. x / warpSize; for( int stride = 1; stride < warpSize; stride *= 2 ) const int threadInWarpIdx = threadIdx. x % tnlCuda::getWarpSize(); const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == warpSize - 1 ) if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; __syncthreads(); Loading @@ -102,7 +104,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) for( int stride = 1; stride < warpSize; stride *= 2 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); Loading @@ -124,8 +126,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT Index chunkShift( operation. cudaIdentity() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ interleave< shmBanks >( idx ) ]; operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } __syncthreads(); Loading @@ -134,10 +136,10 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation. cudaPerform( sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ], sharedData[ interleave< shmBanks >( lastElementInBlock ) ] ); operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); else auxArray[ blockIdx. x ] = sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ]; auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } } Loading Loading @@ -202,8 +204,9 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, /**** * Run the kernel. */ size_t sharedDataSize = elementsInBlock + elementsInBlock / shmBanks + 2; size_t sharedMemory = ( sharedDataSize + blockSize + warpSize ) * sizeof( DataType ); size_t sharedDataSize = elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2; size_t sharedMemory = ( sharedDataSize + blockSize + tnlCuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType, Loading Loading @@ -332,4 +335,6 @@ bool cudaPrefixSum( const Index size, return true; } #endif #endif /* CUDA_PREFIX_SUM_IMPL_H_ */