Loading src/core/cuda/cuda-prefix-sum.h +4 −3 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; template< typename DataType > /*template< typename DataType > class operationSum { public: Loading Loading @@ -58,16 +58,17 @@ class operationSum }; #endif }; };*/ template< typename DataType, template< typename T > class Operation, typename Operation, typename Index > bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); Loading src/core/cuda/reduction-operations.h +19 −0 Original line number Diff line number Diff line Loading @@ -176,6 +176,25 @@ class tnlParallelReductionSum { return data[ idx1 ] + data[ idx2 ]; }; __device__ ResultType commonReductionOnDevice( const ResultType& a, const ResultType& b ) const { return a + b; }; __device__ RealType identity() const { return 0; }; __device__ void performInPlace( ResultType& a, const ResultType& b ) { a += b; } #endif }; Loading src/implementation/core/cuda/cuda-prefix-sum_impl.h +13 −11 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( prefixSumType == exclusivePrefixSum ) { if( idx == 0 ) sharedData[ 0 ] = operation. cudaIdentity(); sharedData[ 0 ] = operation.identity(); while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; Loading Loading @@ -80,7 +80,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -94,7 +94,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; Loading @@ -106,14 +106,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( warpIdx == 0 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); /*** * Store the result back in global memory. Loading @@ -126,7 +126,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT Index chunkShift( operation. cudaIdentity() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } Loading @@ -136,8 +136,9 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), tnlCuda::getInterleaving( lastElementInBlock ), sharedData ); else auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } Loading @@ -156,7 +157,7 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Index size, Operation< DataType > operation; if( blockIdx. x > 0 ) { const Index shift = operation. cudaPerform( gridShift, auxArray[ blockIdx. x - 1 ] ); const Index shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] ); const Index readOffset = blockIdx. x * elementsInBlock; Index readIdx = threadIdx. x; Loading Loading @@ -288,12 +289,13 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, } template< typename DataType, template< typename T > class Operation, typename Operation, typename Index > bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType ) { /**** Loading src/implementation/core/vectors/tnlVectorOperationsCuda_impl.h +17 −6 Original line number Diff line number Diff line Loading @@ -565,12 +565,17 @@ void tnlVectorOperations< tnlCuda >::computePrefixSum( Vector& v, typename Vector::IndexType begin, typename Vector::IndexType end ) { typedef tnlParallelReductionSum< typename Vector::IndexType, typename Vector::RealType > OperationType; OperationType operation; cudaPrefixSum< typename Vector::RealType, operationSum, OperationType, typename Vector::IndexType >( end - begin, 256, &v.getData()[ begin ], &v.getData()[ begin ], operation, inclusivePrefixSum ); } Loading @@ -579,12 +584,18 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v, typename Vector::IndexType begin, typename Vector::IndexType end ) { typedef tnlParallelReductionSum< typename Vector::IndexType, typename Vector::RealType > OperationType; OperationType operation; cudaPrefixSum< typename Vector::RealType, operationSum, OperationType, typename Vector::IndexType >( end - begin, 256, &v.getData()[ begin ], &v.getData()[ begin ], operation, exclusivePrefixSum ); } Loading Loading
src/core/cuda/cuda-prefix-sum.h +4 −3 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; template< typename DataType > /*template< typename DataType > class operationSum { public: Loading Loading @@ -58,16 +58,17 @@ class operationSum }; #endif }; };*/ template< typename DataType, template< typename T > class Operation, typename Operation, typename Index > bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); Loading
src/core/cuda/reduction-operations.h +19 −0 Original line number Diff line number Diff line Loading @@ -176,6 +176,25 @@ class tnlParallelReductionSum { return data[ idx1 ] + data[ idx2 ]; }; __device__ ResultType commonReductionOnDevice( const ResultType& a, const ResultType& b ) const { return a + b; }; __device__ RealType identity() const { return 0; }; __device__ void performInPlace( ResultType& a, const ResultType& b ) { a += b; } #endif }; Loading
src/implementation/core/cuda/cuda-prefix-sum_impl.h +13 −11 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( prefixSumType == exclusivePrefixSum ) { if( idx == 0 ) sharedData[ 0 ] = operation. cudaIdentity(); sharedData[ 0 ] = operation.identity(); while( idx < elementsInBlock && blockOffset + idx < size ) { sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; Loading Loading @@ -80,7 +80,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx. x ] = sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -94,7 +94,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT const int warpIdx = threadIdx. x / tnlCuda::getWarpSize(); for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] ); if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx. x ]; Loading @@ -106,14 +106,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( warpIdx == 0 ) for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 ) if( threadInWarpIdx >= stride ) operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] ); __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) operation. cudaPerformInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] ); /*** * Store the result back in global memory. Loading @@ -126,7 +126,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT Index chunkShift( operation. cudaIdentity() ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ]; idx += blockDim. x; } Loading @@ -136,8 +136,9 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT { if( prefixSumType == exclusivePrefixSum ) auxArray[ blockIdx. x ] = operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] ); operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ), tnlCuda::getInterleaving( lastElementInBlock ), sharedData ); else auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ]; } Loading @@ -156,7 +157,7 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Index size, Operation< DataType > operation; if( blockIdx. x > 0 ) { const Index shift = operation. cudaPerform( gridShift, auxArray[ blockIdx. x - 1 ] ); const Index shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] ); const Index readOffset = blockIdx. x * elementsInBlock; Index readIdx = threadIdx. x; Loading Loading @@ -288,12 +289,13 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, } template< typename DataType, template< typename T > class Operation, typename Operation, typename Index > bool cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType ) { /**** Loading
src/implementation/core/vectors/tnlVectorOperationsCuda_impl.h +17 −6 Original line number Diff line number Diff line Loading @@ -565,12 +565,17 @@ void tnlVectorOperations< tnlCuda >::computePrefixSum( Vector& v, typename Vector::IndexType begin, typename Vector::IndexType end ) { typedef tnlParallelReductionSum< typename Vector::IndexType, typename Vector::RealType > OperationType; OperationType operation; cudaPrefixSum< typename Vector::RealType, operationSum, OperationType, typename Vector::IndexType >( end - begin, 256, &v.getData()[ begin ], &v.getData()[ begin ], operation, inclusivePrefixSum ); } Loading @@ -579,12 +584,18 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v, typename Vector::IndexType begin, typename Vector::IndexType end ) { typedef tnlParallelReductionSum< typename Vector::IndexType, typename Vector::RealType > OperationType; OperationType operation; cudaPrefixSum< typename Vector::RealType, operationSum, OperationType, typename Vector::IndexType >( end - begin, 256, &v.getData()[ begin ], &v.getData()[ begin ], operation, exclusivePrefixSum ); } Loading