Commit 27a355c6 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Fixed block size in ParallelFor3D and improved heuristics

parent 7930075e
Loading
Loading
Loading
Loading
+21 −5
Original line number Diff line number Diff line
@@ -288,20 +288,36 @@ struct ParallelFor3D< Devices::Cuda >
         const Index sizeZ = endZ - startZ;

         dim3 blockSize;
         if( sizeX >= sizeY * sizeZ ) {
         if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
            blockSize.x = TNL::min( 256, sizeX );
            blockSize.y = 1;
            blockSize.z = 1;
         }
         else if( sizeY >= sizeX * sizeZ ) {
         else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
            blockSize.x = 1;
            blockSize.y = TNL::min( 256, sizeY );
            blockSize.z = 1;
         }
         else if( sizeZ >= sizeX * sizeY ) {
            blockSize.x = 1;
         else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
            blockSize.x = TNL::min( 2, sizeX );
            blockSize.y = TNL::min( 2, sizeY );
            // CUDA allows max 64 for blockSize.z
            blockSize.z = TNL::min( 64, sizeZ );
         }
         else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
            blockSize.x = TNL::min( 32, sizeX );
            blockSize.y = TNL::min( 8, sizeY );
            blockSize.z = 1;
         }
         else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
            blockSize.x = TNL::min( 32, sizeX );
            blockSize.y = 1;
            blockSize.z = TNL::min( 256, sizeZ );
            blockSize.z = TNL::min( 8, sizeZ );
         }
         else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
            blockSize.x = 1;
            blockSize.y = TNL::min( 32, sizeY );
            blockSize.z = TNL::min( 8, sizeZ );
         }
         else {
            blockSize.x = TNL::min( 16, sizeX );