Loading src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +9 −8 Original line number Original line Diff line number Diff line Loading @@ -220,7 +220,7 @@ struct CudaPrefixSumKernelLauncher */ */ const std::size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, ( prefixSumType_, reduction, reduction, Loading @@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher output, output, auxArray1.getData(), auxArray1.getData(), gridShift ); gridShift ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher gridShift, gridShift, auxArray2.getData(), auxArray2.getData(), output ); output ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, cudaMemcpy( &gridShift, Loading @@ -284,11 +286,11 @@ struct CudaPrefixSumKernelLauncher template< typename Reduction > template< typename Reduction > static void static void start( const Index size, start( const Index size, const Index blockSize, const Index blockSize, const Real *deviceInput, const Real *deviceInput, Real* deviceOutput, Real* deviceOutput, Reduction& reduction, Reduction& reduction, const Real& zero ) const Real& zero ) { { /**** /**** * Compute the number of grids * Compute the number of grids Loading Loading @@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher gridShift, gridShift, &deviceInput[ gridOffset ], &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ] ); &deviceOutput[ gridOffset ] ); TNL_CHECK_CUDA_DEVICE; } } /*** /*** * Store the number of CUDA grids for a purpose of unit testing, i.e. * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. * to check if we test the algorithm with more than one CUDA grid. */ */ gridsCount = numberOfGrids; gridsCount = numberOfGrids; Loading Loading
src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +9 −8 Original line number Original line Diff line number Diff line Loading @@ -220,7 +220,7 @@ struct CudaPrefixSumKernelLauncher */ */ const std::size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, ( prefixSumType_, reduction, reduction, Loading @@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher output, output, auxArray1.getData(), auxArray1.getData(), gridShift ); gridShift ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher gridShift, gridShift, auxArray2.getData(), auxArray2.getData(), output ); output ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, cudaMemcpy( &gridShift, Loading @@ -284,11 +286,11 @@ struct CudaPrefixSumKernelLauncher template< typename Reduction > template< typename Reduction > static void static void start( const Index size, start( const Index size, const Index blockSize, const Index blockSize, const Real *deviceInput, const Real *deviceInput, Real* deviceOutput, Real* deviceOutput, Reduction& reduction, Reduction& reduction, const Real& zero ) const Real& zero ) { { /**** /**** * Compute the number of grids * Compute the number of grids Loading Loading @@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher gridShift, gridShift, &deviceInput[ gridOffset ], &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ] ); &deviceOutput[ gridOffset ] ); TNL_CHECK_CUDA_DEVICE; } } /*** /*** * Store the number of CUDA grids for a purpose of unit testing, i.e. * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. * to check if we test the algorithm with more than one CUDA grid. */ */ gridsCount = numberOfGrids; gridsCount = numberOfGrids; Loading