src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h  (+11 −23)

@@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
                               const Index elementsInBlock,
                               const Real* input,
                               Real* output,
-                              Real* auxArray,
-                              const Real gridShift )
+                              Real* auxArray )
 {
    Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
    Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
@@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
          idx += blockDim.x;
       }
    }
-   if( blockIdx.x == 0 && threadIdx.x == 0 )
-      sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift );

    /***
     * Perform the sequential prefix-sum.
@@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
    {
       if( prefixSumType == PrefixSumType::Exclusive )
       {
-         Real aux = zero;
-         aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] );
-         aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
-         auxArray[ blockIdx.x ] = aux;
+         auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ],
+                                             sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
       }
       else
          auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
@@ -172,17 +167,15 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction,
                                Real* data )
 {
    if( blockIdx.x > 0 )
    {
-      const Real shift = auxArray[ blockIdx.x - 1 ];
+      gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] );
       const Index readOffset = blockIdx.x * elementsInBlock;
       Index readIdx = threadIdx.x;
       while( readIdx < elementsInBlock && readOffset + readIdx < size )
       {
-         data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
+         data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift );
          readIdx += blockDim.x;
       }
    }
 }

 template< PrefixSumType prefixSumType,
           typename Real,
@@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher
             elementsInBlock,
             input,
             output,
-            auxArray1.getData(),
-            gridShift );
+            auxArray1.getData() );
       cudaStreamSynchronize(0);
       TNL_CHECK_CUDA_DEVICE;

@@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher
       cudaStreamSynchronize(0);
       TNL_CHECK_CUDA_DEVICE;

-      cudaMemcpy( &gridShift,
-                  &auxArray2[ auxArraySize - 1 ],
-                  sizeof( Real ),
-                  cudaMemcpyDeviceToHost );
+      gridShift = auxArray2.getElement( auxArraySize - 1 );
       //std::cerr << "gridShift = " << gridShift << std::endl;
-      TNL_CHECK_CUDA_DEVICE;
    }

    /****
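Taken together, the diff moves the grid-wide shift out of the first-phase kernel: instead of block 0 folding gridShift into its shared memory, the second-phase kernel now combines gridShift with the preceding block's auxiliary value through the reduction, the exclusive-scan auxiliary value is computed with a single reduction call (dropping the "zero" accumulator), and the launcher reads the new shift back with auxArray2.getElement() instead of a raw cudaMemcpy. The following is a minimal host-side sketch (plain sequential C++, not TNL code) of the two-phase scheme for an inclusive scan, written under my reading of the diff; the per-block loops stand in for the kernels, and the treatment of block 0 in phase 2 is my assumption, since the diff only shows the "blockIdx.x > 0" path.

#include <algorithm>
#include <vector>
#include <iostream>

template< typename Real, typename Reduction >
void twoPhaseInclusiveScan( std::vector< Real >& data,
                            const int elementsInBlock,
                            Real gridShift,          // carried over from a previous "grid" launch
                            Reduction reduction )
{
   const int blocks = ( (int) data.size() + elementsInBlock - 1 ) / elementsInBlock;
   std::vector< Real > auxArray( blocks );

   // Phase 1: independent per-block inclusive scan; record each block's total
   // in auxArray, as cudaFirstPhaseBlockPrefixSum does. Note that gridShift no
   // longer enters here.
   for( int b = 0; b < blocks; b++ ) {
      const int begin = b * elementsInBlock;
      const int end = std::min( begin + elementsInBlock, (int) data.size() );
      for( int i = begin + 1; i < end; i++ )
         data[ i ] = reduction( data[ i ], data[ i - 1 ] );
      auxArray[ b ] = data[ end - 1 ];
   }

   // Scan of the block totals (the real launcher does this with a recursive
   // kernel launch over auxArray1/auxArray2).
   for( int b = 1; b < blocks; b++ )
      auxArray[ b ] = reduction( auxArray[ b ], auxArray[ b - 1 ] );

   // Phase 2: shift each block, mirroring
   //    gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] );
   // from cudaSecondPhaseBlockPrefixSum. Shifting block 0 by gridShift alone
   // is an assumption -- the diff does not show how block 0 is handled.
   for( int b = 0; b < blocks; b++ ) {
      const Real shift = ( b > 0 ) ? reduction( gridShift, auxArray[ b - 1 ] )
                                   : gridShift;
      const int begin = b * elementsInBlock;
      const int end = std::min( begin + elementsInBlock, (int) data.size() );
      for( int i = begin; i < end; i++ )
         data[ i ] = reduction( data[ i ], shift );
   }
}

int main()
{
   std::vector< int > v{ 1, 2, 3, 4, 5, 6, 7 };
   twoPhaseInclusiveScan( v, 3, 0, []( int a, int b ) { return a + b; } );
   for( int x : v )
      std::cout << x << ' ';   // prints: 1 3 6 10 15 21 28
   std::cout << '\n';
}

With an identity gridShift (0 for addition) this reduces to an ordinary blocked scan; across repeated launches, the launcher's gridShift = auxArray2.getElement( auxArraySize - 1 ) carries the total of the previous grid into the next one, which is exactly the value the second-phase kernel now folds in via the reduction.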