diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index d0c2d0601b7cbf565b3f602127c54e04ab229c80..9989954b56101398ac8cbf6aa8c9b1344a67e44a 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -288,20 +288,36 @@ struct ParallelFor3D< Devices::Cuda > const Index sizeZ = endZ - startZ; dim3 blockSize; - if( sizeX >= sizeY * sizeZ ) { + if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { blockSize.x = TNL::min( 256, sizeX ); blockSize.y = 1; blockSize.z = 1; } - else if( sizeY >= sizeX * sizeZ ) { + else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { blockSize.x = 1; blockSize.y = TNL::min( 256, sizeY ); blockSize.z = 1; } - else if( sizeZ >= sizeX * sizeY ) { - blockSize.x = 1; + else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { + blockSize.x = TNL::min( 2, sizeX ); + blockSize.y = TNL::min( 2, sizeY ); + // CUDA allows max 64 for blockSize.z + blockSize.z = TNL::min( 64, sizeZ ); + } + else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { + blockSize.x = TNL::min( 32, sizeX ); + blockSize.y = TNL::min( 8, sizeY ); + blockSize.z = 1; + } + else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { + blockSize.x = TNL::min( 32, sizeX ); blockSize.y = 1; - blockSize.z = TNL::min( 256, sizeZ ); + blockSize.z = TNL::min( 8, sizeZ ); + } + else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { + blockSize.x = 1; + blockSize.y = TNL::min( 32, sizeY ); + blockSize.z = TNL::min( 8, sizeZ ); } else { blockSize.x = TNL::min( 16, sizeX );