From 27a355c65722ac58ecd3322eabe9a93a1d1d8406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Sat, 9 Feb 2019 21:39:00 +0100 Subject: [PATCH] Fixed block size in ParallelFor3D and improved heuristics --- src/TNL/ParallelFor.h | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index d0c2d0601b..9989954b56 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -288,20 +288,36 @@ struct ParallelFor3D< Devices::Cuda > const Index sizeZ = endZ - startZ; dim3 blockSize; - if( sizeX >= sizeY * sizeZ ) { + if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { blockSize.x = TNL::min( 256, sizeX ); blockSize.y = 1; blockSize.z = 1; } - else if( sizeY >= sizeX * sizeZ ) { + else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { blockSize.x = 1; blockSize.y = TNL::min( 256, sizeY ); blockSize.z = 1; } - else if( sizeZ >= sizeX * sizeY ) { - blockSize.x = 1; + else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { + blockSize.x = TNL::min( 2, sizeX ); + blockSize.y = TNL::min( 2, sizeY ); + // CUDA allows max 64 for blockSize.z + blockSize.z = TNL::min( 64, sizeZ ); + } + else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { + blockSize.x = TNL::min( 32, sizeX ); + blockSize.y = TNL::min( 8, sizeY ); + blockSize.z = 1; + } + else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { + blockSize.x = TNL::min( 32, sizeX ); blockSize.y = 1; - blockSize.z = TNL::min( 256, sizeZ ); + blockSize.z = TNL::min( 8, sizeZ ); + } + else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { + blockSize.x = 1; + blockSize.y = TNL::min( 32, sizeY ); + blockSize.z = TNL::min( 8, sizeZ ); } else { blockSize.x = TNL::min( 16, sizeX ); -- GitLab