Loading src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +11 −23 Original line number Original line Diff line number Diff line Loading @@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Index elementsInBlock, const Index elementsInBlock, const Real* input, const Real* input, Real* output, Real* output, Real* auxArray, Real* auxArray ) const Real gridShift ) { { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Loading Loading @@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; idx += blockDim.x; } } } } if( blockIdx.x == 0 && threadIdx.x == 0 ) sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** /*** * Perform the sequential prefix-sum. * Perform the sequential prefix-sum. Loading Loading @@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { { if( prefixSumType == PrefixSumType::Exclusive ) if( prefixSumType == PrefixSumType::Exclusive ) { { Real aux = zero; auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } } else else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; Loading @@ -172,17 +167,15 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction, Real* data ) Real* data ) { { if( blockIdx.x > 0 ) if( blockIdx.x > 0 ) { gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); const Real shift = auxArray[ blockIdx.x - 1 ]; const Index readOffset = blockIdx.x * elementsInBlock; const Index readOffset = blockIdx.x * elementsInBlock; Index readIdx = threadIdx.x; Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) while( readIdx < elementsInBlock && readOffset + readIdx < size ) { { data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); readIdx += blockDim.x; readIdx += blockDim.x; } } } } } template< PrefixSumType prefixSumType, template< PrefixSumType prefixSumType, typename Real, typename Real, Loading Loading @@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher elementsInBlock, elementsInBlock, input, input, output, output, auxArray1.getData(), auxArray1.getData() ); gridShift ); cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, gridShift = auxArray2.getElement( auxArraySize - 1 ); &auxArray2[ auxArraySize - 1 ], sizeof( Real ), cudaMemcpyDeviceToHost ); //std::cerr << "gridShift = " << gridShift << std::endl; //std::cerr << "gridShift = " << gridShift << std::endl; TNL_CHECK_CUDA_DEVICE; } } /**** /**** Loading Loading
src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +11 −23 Original line number Original line Diff line number Diff line Loading @@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Index elementsInBlock, const Index elementsInBlock, const Real* input, const Real* input, Real* output, Real* output, Real* auxArray, Real* auxArray ) const Real gridShift ) { { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Loading Loading @@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; idx += blockDim.x; } } } } if( blockIdx.x == 0 && threadIdx.x == 0 ) sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** /*** * Perform the sequential prefix-sum. * Perform the sequential prefix-sum. Loading Loading @@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { { if( prefixSumType == PrefixSumType::Exclusive ) if( prefixSumType == PrefixSumType::Exclusive ) { { Real aux = zero; auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } } else else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; Loading @@ -172,17 +167,15 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction, Real* data ) Real* data ) { { if( blockIdx.x > 0 ) if( blockIdx.x > 0 ) { gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); const Real shift = auxArray[ blockIdx.x - 1 ]; const Index readOffset = blockIdx.x * elementsInBlock; const Index readOffset = blockIdx.x * elementsInBlock; Index readIdx = threadIdx.x; Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) while( readIdx < elementsInBlock && readOffset + readIdx < size ) { { data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); readIdx += blockDim.x; readIdx += blockDim.x; } } } } } template< PrefixSumType prefixSumType, template< PrefixSumType prefixSumType, typename Real, typename Real, Loading Loading @@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher elementsInBlock, elementsInBlock, input, input, output, output, auxArray1.getData(), auxArray1.getData() ); gridShift ); cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, gridShift = auxArray2.getElement( auxArraySize - 1 ); &auxArray2[ auxArraySize - 1 ], sizeof( Real ), cudaMemcpyDeviceToHost ); //std::cerr << "gridShift = " << gridShift << std::endl; //std::cerr << "gridShift = " << gridShift << std::endl; TNL_CHECK_CUDA_DEVICE; } } /**** /**** Loading