Loading src/TNL/Containers/Algorithms/CudaScanKernel.h +13 −13 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockScan( const ScanType ScanType, cudaFirstPhaseBlockScan( const ScanType scanType, Reduction reduction, const Real zero, const Index size, Loading @@ -48,7 +48,7 @@ cudaFirstPhaseBlockScan( const ScanType ScanType, */ const int blockOffset = blockIdx.x * elementsInBlock; int idx = threadIdx.x; if( ScanType == ScanType::Exclusive ) if( scanType == ScanType::Exclusive ) { if( idx == 0 ) sharedData[ 0 ] = zero; Loading Loading @@ -145,7 +145,7 @@ cudaFirstPhaseBlockScan( const ScanType ScanType, if( threadIdx.x == 0 ) { if( ScanType == ScanType::Exclusive ) if( scanType == ScanType::Exclusive ) { auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); Loading Loading @@ -179,7 +179,7 @@ cudaSecondPhaseBlockScan( Reduction reduction, } } template< ScanType ScanType, template< ScanType scanType, typename Real, typename Index > struct CudaScanKernelLauncher Loading Loading @@ -271,7 +271,7 @@ struct CudaScanKernelLauncher elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( ScanType, ( scanType, reduction, zero, currentSize, Loading Loading @@ -306,7 +306,7 @@ struct CudaScanKernelLauncher } /**** * \brief Performs the seocond phase of prefix sum. * \brief Performs the second phase of prefix sum. * * \param size Number of elements to be scanned. * \param deviceOutput Pointer to output array on GPU. Loading Loading
src/TNL/Containers/Algorithms/CudaScanKernel.h +13 −13 Original line number Diff line number Diff line Loading @@ -27,7 +27,7 @@ template< typename Real, typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockScan( const ScanType ScanType, cudaFirstPhaseBlockScan( const ScanType scanType, Reduction reduction, const Real zero, const Index size, Loading @@ -48,7 +48,7 @@ cudaFirstPhaseBlockScan( const ScanType ScanType, */ const int blockOffset = blockIdx.x * elementsInBlock; int idx = threadIdx.x; if( ScanType == ScanType::Exclusive ) if( scanType == ScanType::Exclusive ) { if( idx == 0 ) sharedData[ 0 ] = zero; Loading Loading @@ -145,7 +145,7 @@ cudaFirstPhaseBlockScan( const ScanType ScanType, if( threadIdx.x == 0 ) { if( ScanType == ScanType::Exclusive ) if( scanType == ScanType::Exclusive ) { auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); Loading Loading @@ -179,7 +179,7 @@ cudaSecondPhaseBlockScan( Reduction reduction, } } template< ScanType ScanType, template< ScanType scanType, typename Real, typename Index > struct CudaScanKernelLauncher Loading Loading @@ -271,7 +271,7 @@ struct CudaScanKernelLauncher elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( ScanType, ( scanType, reduction, zero, currentSize, Loading Loading @@ -306,7 +306,7 @@ struct CudaScanKernelLauncher } /**** * \brief Performs the seocond phase of prefix sum. * \brief Performs the second phase of prefix sum. * * \param size Number of elements to be scanned. * \param deviceOutput Pointer to output array on GPU. Loading