Commit 064ee69e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

working on the MHFEM section - tables with CT portions showing TNL vs Hypre

parent b81e52dc
Loading
Loading
Loading
Loading
+15 −22
Original line number Diff line number Diff line
@@ -103,7 +103,7 @@ Comparing the computational times on the mesh 3D$^\triangle_5$ from \cref{tab:co

\begin{table}[!tb]
    \caption{
        Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of the OpenMP and MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest triangular mesh 2D$^\triangle_5$.
        Comparison of CPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest triangular mesh 2D$^\triangle_5$.
    }
    \label{tab:mhfem:comptimes:CPU 2D}
    \centering
@@ -112,7 +112,7 @@ Comparing the computational times on the mesh 3D$^\triangle_5$ from \cref{tab:co

\begin{table}[!tb]
    \caption{
        Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of the OpenMP and MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest tetrahedral mesh 3D$^\triangle_5$.
        Comparison of CPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$.
    }
    \label{tab:mhfem:comptimes:CPU 3D}
    \centering
@@ -131,8 +131,8 @@ The speed-ups could be improved by optimizing the linear system solver (BiCGstab

\begin{table}[tb]
    \caption{
        Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of MPI-based GPU computations for the generalized McWhorter--Sunada problem on the finest triangular and tetrahedral meshes.
        Each rank manages its dedicated GPU.
        Comparison of GPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest triangular and tetrahedral meshes.
        Each MPI rank manages its dedicated GPU.
    }
    \label{tab:mhfem:comptimes:GPU}
    \centering
@@ -148,26 +148,19 @@ Before each sparse matrix--vector multiply, the ghost regions of the input vecto
The communication for dot products is implemented using the \ic{MPI_Allreduce} function and its contribution to the computational time is included in the fifth row in \cref{tab:mhfem:portions:TNL}.
The remaining operations, such as the sparse matrix assembly and various operations for the MHFEM discretization contribute only a small portion to the total computational time, which is even smaller than the communication time required for the linear system solver.

\begin{table}[bth]
\begin{table}[tb]
    \caption{
        Comparison of the portions contributing to the total computational times $CT \, [\text{s}]$ of MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest tetrahedral mesh 3D$^\triangle_5$.
        Comparison of the portions contributing to the total CPU computational times $CT \, [\text{s}]$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$ using BiCGstab from TNL as the linear system solver.
        All values are computed from times averaged over all MPI ranks.
    }
    \label{tab:comptimes:portions}
    \centering
    \scalebox{0.95}{
        \begin{tabular}{lrrrrrrr}
            \toprule
            Number of CPU cores                            & 12 & 24 & 48 & 96 & 192 & 288 & 384 \\
            \midrule
            MHFEM routines                                 &    217.2 &    109.4 &     55.6 &     27.8 &      13.9 &       9.3 &       7.0 \\
            Sparse matrix assembly                         &    147.4 &     72.1 &     35.4 &     17.1 &       8.1 &       5.3 &       3.9 \\
            Linear system solver                           &  22608.9 &  11089.3 &   5500.3 &   2594.9 &    1101.1 &     680.9 &     473.1 \\
            Mesh data communication over MPI               &    659.3 &    676.6 &    348.8 &    210.8 &     132.4 &     105.4 &      87.6 \\
            MPI collective operations (\ic{MPI_Allreduce}) &    316.7 &    308.1 &    231.2 &    165.4 &     154.9 &     123.3 &     130.9 \\
            \midrule
            Total                                          &  23949.5 &  12255.5 &   6171.4 &   3016.0 &    1410.4 &     924.2 &     702.4 \\
            \bottomrule
        \end{tabular}
    \label{tab:mhfem:portions:TNL}
    \centering
    \input{./data/mcwhdd/portions_tnl.tex}
\end{table}
\begin{table}[tb]
    \caption{
        Comparison of the portions contributing to the total CPU computational times $CT \, [\text{s}]$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$ using BoomerAMG-preconditioned BiCGstab from the Hypre library as the linear system solver.
        All values are computed from times averaged over all MPI ranks.
    }
    \label{tab:mhfem:portions:Hypre}
    \centering
    \input{./data/mcwhdd/portions_hypre.tex}
\end{table}
 No newline at end of file
+97 −2
Original line number Diff line number Diff line
@@ -106,7 +106,14 @@ def parse_log(log: Path):
        if match:
            key = match.group("key")
            value = match.group("value")
            if value.strip():
            if not value.strip():
                continue

            # parse complicated fields
            match = re.search(r"avg: (?P<value>[\d.e+-]+) ", value)
            if match:
                value = match.group("value")

            result.setdefault(key, value)

    # add MPI processes from the file path (old logs don't have this field)
@@ -328,6 +335,76 @@ def make_table_gpu(df):

    return df

def make_table_portions(df, meshid, cores):
    """Break the total compute time of each run into percentage portions.

    Parameters:
        df: dataframe with one row per run.  It must contain the columns
            "meshid", "MPI ranks", "Host name", "Device", "OMP enabled",
            "OMP threads", "Pre-iterate time", "Post-iterate time",
            "upwind MPI synchronization time", "Linear system assembler time"
            and "Compute time", plus either the Hypre timers
            ("Hypre setup time", "Hypre solve time" and optionally
            "Hypre result MPI synchronization time") or the TNL solver timers
            ("Linear preconditioner update time", "Linear system solver time",
            "MPI_Allreduce time" and at least one of
            "MPI synchronization time" /
            "faceSynchronizer async operations time").
        meshid: only rows whose "meshid" equals this value are used.
        cores: container of the "MPI ranks" values to keep.

    Returns:
        A transposed dataframe of strings: one row per portion (formatted as
        "x.y %" of the total) followed by a "Total" row (formatted as "N s"),
        and one column per number of MPI ranks in ascending order.

    Raises:
        AssertionError: if a selected run is not a CPU run without OpenMP.
        KeyError: if a required column is missing.
    """
    # filter meshid and cores; the explicit copy makes the assignments below
    # operate on an independent frame rather than on a slice of the input
    # (this avoids pandas' SettingWithCopyWarning)
    df = df[df["meshid"].map(lambda x: x == meshid)]
    df = df[df["MPI ranks"].map(lambda x: x in cores)].copy()
    df["MPI ranks"] = df["MPI ranks"].map(int)

    # rename the rank column, use it as the index and sort by it
    df = (
        df.rename(columns={"MPI ranks": "Number of MPI ranks"})
          .set_index("Number of MPI ranks")
          .sort_index()
    )

    # remove uninteresting columns (after checking that the runs are uniform)
    df = df.drop(columns=["meshid", "Host name"])
    assert (df["Device"] == "CPU").all()
    assert (df["OMP enabled"] == "no").all()
    assert (df["OMP threads"] == 1).all()
    df = df.drop(columns=["Device", "OMP enabled", "OMP threads"])

    # create a new dataframe and add the combined times
    df2 = pandas.DataFrame()
    df2["MHFEM routines"] = df["Pre-iterate time"] + df["Post-iterate time"] - df["upwind MPI synchronization time"]
    df2["MPI communication of mesh data"] = df["upwind MPI synchronization time"]
    df2["Sparse matrix assembly"] = df["Linear system assembler time"]
    if "Hypre solve time" in df:
        df2["BoomerAMG update"] = df["Hypre setup time"]
        df2["Linear system solver"] = df["Hypre solve time"]
        if "Hypre result MPI synchronization time" in df:
            df2["MPI communication of mesh data"] += df["Hypre result MPI synchronization time"]
            compensate = None
        else:
            # missing entry in old logs - fortunately it is negligible
            compensate = "MPI communication of mesh data"
    else:
        # unify old and new logs: the synchronization time is stored under
        # one of two names, so add up whichever of them are present
        # (missing values count as zero)
        sync_columns = [
            name
            for name in ("MPI synchronization time", "faceSynchronizer async operations time")
            if name in df
        ]
        if not sync_columns:
            raise KeyError("faceSynchronizer async operations time")
        face_sync = df[sync_columns].fillna(0).sum(axis=1)

        df2["Linear system solver (compute)"] = df["Linear preconditioner update time"] + df["Linear system solver time"] - face_sync - df["MPI_Allreduce time"]
        df2[r"Linear system solver (MPI sync.)"] = face_sync + df["MPI_Allreduce time"]
        # the difference is due to averaging over the MPI ranks - collective operations are pipelined...
        compensate = "Linear system solver (MPI sync.)"
    df2["Total"] = df["Compute time"]

    # compensate the difference in total: add the part of the total that is
    # not covered by any portion to the selected column
    if compensate is not None:
        df2[compensate] += df2["Total"] - df2.drop(columns="Total").sum(axis=1)

    # convert to percentages; the numeric total is captured first so that the
    # result does not depend on the position of the "Total" column
    total = df2["Total"].copy()
    for c in df2.columns:
        if c == "Total":
            # just round and add the unit
            df2[c] = total.map("{:.0f} s".format)
        else:
            df2[c] = (df2[c] / total * 100).map("{:.1f} %".format)

    return df2.T

# load each directory as separate dataframe
helios_gpu = get_dataframe("helios_gpu")
rci_2D = get_dataframe("rci_2D")
@@ -374,3 +451,21 @@ print()
print("GPU")
print(df_gpu)
make_latex_table(df_gpu, "gpu")

# TODO: 384 (16 nodes) instead of 336 (14 nodes)
def _export_portions_table(source, title, filename):
    # Build the portions table for the mesh "cube1m_5", print it under the
    # given title and write it as a LaTeX tabular into the given file.
    table = make_table_portions(source, "cube1m_5", [12, 24, 48, 96, 192, 288, 336])
    print()
    print(title)
    print(table)
    # use a thin space between the number and the percent sign in LaTeX
    table = table.applymap(lambda v: v.replace(" %", r"\thinspace\%"))
    plt.write_latex(table, filename, header_in_math=False, hide_nans=True,
                    index_column_types="l", column_types="r" * len(table.columns),
                    sparsify_index=False, sparsify_header=False)
    return table

df_portions_tnl = _export_portions_table(df, "TNL portions", "portions_tnl.tex")
df_portions_hypre = _export_portions_table(df_hypre_cpu, "Hypre portions", "portions_hypre.tex")

data/mcwhdd/portions.ods

deleted100644 → 0
−15.5 KiB

File deleted.

+59 −0
Original line number Diff line number Diff line


% The table needs the following to be defined in the preamble:
%   
%   \usepackage{booktabs}
%   \usepackage{multirow}
%   \usepackage{adjustbox}
%   \usepackage{stackengine}
%   \usepackage[np]{numprint}

\begin{tabular}{lrrrrrrr}
\toprule

% header row 0
\multicolumn{1}{l}{Number of MPI ranks}
  &  \multicolumn{1}{c}{\np{12}}
  &  \multicolumn{1}{c}{\np{24}}
  &  \multicolumn{1}{c}{\np{48}}
  &  \multicolumn{1}{c}{\np{96}}
  &  \multicolumn{1}{c}{\np{192}}
  &  \multicolumn{1}{c}{\np{288}}
  &  \multicolumn{1}{c}{\np{336}}
  \\

\midrule


        MHFEM routines
          &  

4.8\thinspace\%  &  4.6\thinspace\%  &  4.6\thinspace\%  &  4.8\thinspace\%  &  4.6\thinspace\%  &  4.3\thinspace\%  &  4.1\thinspace\% \\

        MPI communication of mesh data
          &  

0.1\thinspace\%  &  0.2\thinspace\%  &  0.3\thinspace\%  &  0.3\thinspace\%  &  0.4\thinspace\%  &  1.0\thinspace\%  &  1.1\thinspace\% \\

        Sparse matrix assembly
          &  

3.5\thinspace\%  &  3.2\thinspace\%  &  3.1\thinspace\%  &  3.2\thinspace\%  &  2.9\thinspace\%  &  2.6\thinspace\%  &  2.4\thinspace\% \\

        BoomerAMG update
          &  

0.4\thinspace\%  &  0.4\thinspace\%  &  0.4\thinspace\%  &  0.5\thinspace\%  &  0.5\thinspace\%  &  0.8\thinspace\%  &  0.8\thinspace\% \\

        Linear system solver
          &  

91.1\thinspace\%  &  91.5\thinspace\%  &  91.6\thinspace\%  &  91.2\thinspace\%  &  91.5\thinspace\%  &  91.3\thinspace\%  &  91.6\thinspace\% \\

        Total
          &  

4709 s  &  2485 s  &  1249 s  &  592 s  &  311 s  &  224 s  &  202 s \\

\bottomrule
\end{tabular}
+59 −0
Original line number Diff line number Diff line


% The table needs the following to be defined in the preamble:
%   
%   \usepackage{booktabs}
%   \usepackage{multirow}
%   \usepackage{adjustbox}
%   \usepackage{stackengine}
%   \usepackage[np]{numprint}

\begin{tabular}{lrrrrrrr}
\toprule

% header row 0
\multicolumn{1}{l}{Number of MPI ranks}
  &  \multicolumn{1}{c}{\np{12}}
  &  \multicolumn{1}{c}{\np{24}}
  &  \multicolumn{1}{c}{\np{48}}
  &  \multicolumn{1}{c}{\np{96}}
  &  \multicolumn{1}{c}{\np{192}}
  &  \multicolumn{1}{c}{\np{288}}
  &  \multicolumn{1}{c}{\np{336}}
  \\

\midrule


        MHFEM routines
          &  

0.9\thinspace\%  &  0.9\thinspace\%  &  1.0\thinspace\%  &  1.0\thinspace\%  &  1.1\thinspace\%  &  1.1\thinspace\%  &  1.0\thinspace\% \\

        MPI communication of mesh data
          &  

0.0\thinspace\%  &  0.0\thinspace\%  &  0.1\thinspace\%  &  0.1\thinspace\%  &  0.1\thinspace\%  &  0.2\thinspace\%  &  0.2\thinspace\% \\

        Sparse matrix assembly
          &  

0.7\thinspace\%  &  0.6\thinspace\%  &  0.6\thinspace\%  &  0.6\thinspace\%  &  0.6\thinspace\%  &  0.6\thinspace\%  &  0.5\thinspace\% \\

        Linear system solver (compute)
          &  

94.1\thinspace\%  &  90.0\thinspace\%  &  89.6\thinspace\%  &  85.6\thinspace\%  &  80.1\thinspace\%  &  74.3\thinspace\%  &  65.0\thinspace\% \\

        Linear system solver (MPI sync.)
          &  

4.3\thinspace\%  &  8.4\thinspace\%  &  8.7\thinspace\%  &  12.7\thinspace\%  &  18.1\thinspace\%  &  23.8\thinspace\%  &  33.3\thinspace\% \\

        Total
          &  

23839 s  &  12184 s  &  6029 s  &  2974 s  &  1380 s  &  893 s  &  852 s \\

\bottomrule
\end{tabular}
Loading