working on the MHFEM section - tables with CT portions showing TNL vs Hypre (064ee69e) · Commits · Jakub Klinkovský / Dissertation

content/MHFEM.tex

+15 −22

Original line number	Diff line number	Diff line
		@@ -103,7 +103,7 @@ Comparing the computational times on the mesh 3D$^\triangle_5$ from \cref{tab:co

		\begin{table}[!tb]
		\caption{
		Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of the OpenMP and MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest triangular mesh 2D$^\triangle_5$.
		Comparison of CPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest triangular mesh 2D$^\triangle_5$.
		}
		\label{tab:mhfem:comptimes:CPU 2D}
		\centering
		@@ -112,7 +112,7 @@ Comparing the computational times on the mesh 3D$^\triangle_5$ from \cref{tab:co

		\begin{table}[!tb]
		\caption{
		Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of the OpenMP and MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest tetrahedral mesh 3D$^\triangle_5$.
		Comparison of CPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$.
		}
		\label{tab:mhfem:comptimes:CPU 3D}
		\centering
		@@ -131,8 +131,8 @@ The speed-ups could be improved by optimizing the linear system solver (BiCGstab

		\begin{table}[tb]
		\caption{
		Comparison of computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ of MPI-based GPU computations for the generalized McWhorter--Sunada problem on the finest triangular and tetrahedral meshes.
		Each rank manages its dedicated GPU.
		Comparison of GPU computational times $CT \, [\text{s}]$, speed-up $Sp$, and parallel efficiency $E\!f\!f$ for the generalized McWhorter--Sunada problem computed on the finest triangular and tetrahedral meshes.
		Each MPI rank manages its dedicated GPU.
		}
		\label{tab:mhfem:comptimes:GPU}
		\centering
		@@ -148,26 +148,19 @@ Before each sparse matrix--vector multiply, the ghost regions of the input vecto
		The communication for dot products is implemented using the \ic{MPI_Allreduce} function and its contribution to the computational time is included in the fifth row in \cref{tab:mhfem:portions:TNL}.
		The remaining operations, such as the sparse matrix assembly and various operations for the MHFEM discretization contribute only a small portion to the total computational time, which is even smaller than the communication time required for the linear system solver.

		\begin{table}[bth]
		\begin{table}[tb]
		\caption{
		Comparison of the portions contributing to the total computational times $CT \, [\text{s}]$ of MPI-based CPU computations for the generalized McWhorter--Sunada problem on the finest tetrahedral mesh 3D$^\triangle_5$.
		Comparison of the portions contributing to the total CPU computational times $CT \, [\text{s}]$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$ using BiCGstab from TNL as the linear system solver.
		All values are averages over all MPI ranks.
		}
		\label{tab:comptimes:portions}
		\centering
		\scalebox{0.95}{
		\begin{tabular}{lrrrrrrr}
		\toprule
		Number of CPU cores & 12 & 24 & 48 & 96 & 192 & 288 & 384 \\
		\midrule
		MHFEM routines & 217.2 & 109.4 & 55.6 & 27.8 & 13.9 & 9.3 & 7.0 \\
		Sparse matrix assembly & 147.4 & 72.1 & 35.4 & 17.1 & 8.1 & 5.3 & 3.9 \\
		Linear system solver & 22608.9 & 11089.3 & 5500.3 & 2594.9 & 1101.1 & 680.9 & 473.1 \\
		Mesh data communication over MPI & 659.3 & 676.6 & 348.8 & 210.8 & 132.4 & 105.4 & 87.6 \\
		MPI collective operations (\ic{MPI_Allreduce}) & 316.7 & 308.1 & 231.2 & 165.4 & 154.9 & 123.3 & 130.9 \\
		\midrule
		Total & 23949.5 & 12255.5 & 6171.4 & 3016.0 & 1410.4 & 924.2 & 702.4 \\
		\bottomrule
		\end{tabular}
		\label{tab:mhfem:portions:TNL}
		\input{./data/mcwhdd/portions_tnl.tex}
		\end{table}
		\begin{table}[tb]
		\caption{
		Comparison of the portions contributing to the total CPU computational times $CT \, [\text{s}]$ for the generalized McWhorter--Sunada problem computed on the finest tetrahedral mesh 3D$^\triangle_5$ using BoomerAMG-preconditioned BiCGstab from the Hypre library as the linear system solver.
		All values are averages over all MPI ranks.
		}
		\label{tab:mhfem:portions:Hypre}
		\input{./data/mcwhdd/portions_hypre.tex}
		\end{table}
		No newline at end of file

data/mcwhdd/make_tables.py

+97 −2

Original line number	Diff line number	Diff line
		@@ -106,7 +106,14 @@ def parse_log(log: Path):
		if match:
		key = match.group("key")
		value = match.group("value")
		if value.strip():
		if not value.strip():
		continue

		# parse complicated fields
		match = re.search(r"avg: (?P<value>[\d.e+-]+) ", value)
		if match:
		value = match.group("value")

		result.setdefault(key, value)

		# add MPI processes from the file path (old logs don't have this field)
		@@ -328,6 +335,76 @@ def make_table_gpu(df):

		return df

		def make_table_portions(df, meshid, cores):
		    """Build a table with the portions of the total computational time.

		    Parameters:
		        df: dataframe with one row per run; it must contain the columns
		            read below ("meshid", "MPI ranks", "Compute time", ...).
		        meshid: value of the "meshid" column to select.
		        cores: collection of "MPI ranks" values to keep.

		    Returns:
		        A transposed dataframe of strings: one row per portion plus the
		        "Total" row, one column per number of MPI ranks.  Portions are
		        formatted as percentages of the total ("12.3 %"), the total as
		        rounded seconds ("123 s").
		    """
		    # select the requested mesh and rank counts; the explicit copy makes
		    # the modifications below operate on an independent dataframe instead
		    # of a slice of the caller's data (avoids SettingWithCopyWarning)
		    df = df[(df["meshid"] == meshid) & df["MPI ranks"].isin(cores)].copy()
		    df["MPI ranks"] = df["MPI ranks"].map(int)

		    # rename the rank column, use it as the index and sort by it
		    df = df.rename(columns={"MPI ranks": "Number of MPI ranks"})
		    df = df.set_index("Number of MPI ranks").sort_index()

		    # remove uninteresting columns (after checking the expected setup:
		    # CPU runs without OpenMP)
		    assert (df["Device"] == "CPU").all()
		    assert (df["OMP enabled"] == "no").all()
		    assert (df["OMP threads"] == 1).all()
		    df = df.drop(columns=["meshid", "Host name", "Device", "OMP enabled", "OMP threads"])

		    # print(df.T)

		    # create a new dataframe and add the combined times
		    df2 = pandas.DataFrame()
		    df2["MHFEM routines"] = df["Pre-iterate time"] + df["Post-iterate time"] - df["upwind MPI synchronization time"]
		    df2["MPI communication of mesh data"] = df["upwind MPI synchronization time"]
		    df2["Sparse matrix assembly"] = df["Linear system assembler time"]
		    if "Hypre solve time" in df:
		        df2["BoomerAMG update"] = df["Hypre setup time"]
		        df2["Linear system solver"] = df["Hypre solve time"]
		        if "Hypre result MPI synchronization time" in df:
		            df2["MPI communication of mesh data"] += df["Hypre result MPI synchronization time"]
		            compensate = None
		        else:
		            # missing entry in old logs - fortunately it is negligible
		            compensate = "MPI communication of mesh data"
		    else:
		        # unify old and new logs: the two columns are summed with missing
		        # values counted as zero
		        sync_time = df["MPI synchronization time"].fillna(0) + df["faceSynchronizer async operations time"].fillna(0)
		        allreduce_time = df["MPI_Allreduce time"]

		        df2["Linear system solver (compute)"] = df["Linear preconditioner update time"] + df["Linear system solver time"] - sync_time - allreduce_time
		        df2[r"Linear system solver (MPI sync.)"] = sync_time + allreduce_time
		        # the difference is due to averaging over the MPI ranks - collective operations are pipelined...
		        compensate = "Linear system solver (MPI sync.)"
		    df2["Total"] = df["Compute time"]

		    # compensate the difference in total: the row sum includes "Total"
		    # itself, so 2 * Total - sum == Total - (sum of the portions)
		    if compensate is not None:
		        df2[compensate] += 2 * df2["Total"] - df2.sum(axis=1)

		    # convert to percentages; the numeric totals are captured first so
		    # that the result does not depend on "Total" being formatted last
		    total = df2["Total"].copy()
		    for c in df2.columns:
		        if c == "Total":
		            # just round and add the unit
		            df2[c] = total.map("{:.0f} s".format)
		        else:
		            df2[c] = (df2[c] / total * 100).map("{:.1f} %".format)

		    return df2.T

		# load each directory as separate dataframe
		helios_gpu = get_dataframe("helios_gpu")
		rci_2D = get_dataframe("rci_2D")
		@@ -374,3 +451,21 @@ print()
		print("GPU")
		print(df_gpu)
		make_latex_table(df_gpu, "gpu")

		# Portions of the total computational time with the TNL solver.
		# TODO: 384 (16 nodes) instead of 336 (14 nodes)
		df_portions_tnl = make_table_portions(
		    df,
		    "cube1m_5",
		    [12, 24, 48, 96, 192, 288, 336],
		)
		print()
		print("TNL portions")
		print(df_portions_tnl)
		# typeset the percent sign for LaTeX (thin space + escaped %)
		df_portions_tnl = df_portions_tnl.applymap(
		    lambda cell: cell.replace(" %", r"\thinspace\%")
		)
		plt.write_latex(
		    df_portions_tnl,
		    "portions_tnl.tex",
		    header_in_math=False,
		    hide_nans=True,
		    index_column_types="l",
		    column_types="r" * len(df_portions_tnl.columns),
		    sparsify_index=False,
		    sparsify_header=False,
		)

		# The same table for the runs using the Hypre solver.
		df_portions_hypre = make_table_portions(
		    df_hypre_cpu,
		    "cube1m_5",
		    [12, 24, 48, 96, 192, 288, 336],
		)
		print()
		print("Hypre portions")
		print(df_portions_hypre)
		# typeset the percent sign for LaTeX (thin space + escaped %)
		df_portions_hypre = df_portions_hypre.applymap(
		    lambda cell: cell.replace(" %", r"\thinspace\%")
		)
		plt.write_latex(
		    df_portions_hypre,
		    "portions_hypre.tex",
		    header_in_math=False,
		    hide_nans=True,
		    index_column_types="l",
		    column_types="r" * len(df_portions_hypre.columns),
		    sparsify_index=False,
		    sparsify_header=False,
		)

data/mcwhdd/portions.ods

deleted100644 → 0

−15.5 KiB

File deleted.

View file

data/mcwhdd/portions_hypre.tex

0 → 100644

+59 −0

Original line number	Diff line number	Diff line


		% The table needs the following to be defined in the preamble:
		%
		% \usepackage{booktabs}
		% \usepackage{multirow}
		% \usepackage{adjustbox}
		% \usepackage{stackengine}
		% \usepackage[np]{numprint}

		\begin{tabular}{lrrrrrrr}
		\toprule

		% header row 0
		\multicolumn{1}{l}{Number of MPI ranks}
		& \multicolumn{1}{c}{\np{12}}
		& \multicolumn{1}{c}{\np{24}}
		& \multicolumn{1}{c}{\np{48}}
		& \multicolumn{1}{c}{\np{96}}
		& \multicolumn{1}{c}{\np{192}}
		& \multicolumn{1}{c}{\np{288}}
		& \multicolumn{1}{c}{\np{336}}
		\\

		\midrule


		MHFEM routines
		&

		4.8\thinspace\% & 4.6\thinspace\% & 4.6\thinspace\% & 4.8\thinspace\% & 4.6\thinspace\% & 4.3\thinspace\% & 4.1\thinspace\% \\

		MPI communication of mesh data
		&

		0.1\thinspace\% & 0.2\thinspace\% & 0.3\thinspace\% & 0.3\thinspace\% & 0.4\thinspace\% & 1.0\thinspace\% & 1.1\thinspace\% \\

		Sparse matrix assembly
		&

		3.5\thinspace\% & 3.2\thinspace\% & 3.1\thinspace\% & 3.2\thinspace\% & 2.9\thinspace\% & 2.6\thinspace\% & 2.4\thinspace\% \\

		BoomerAMG update
		&

		0.4\thinspace\% & 0.4\thinspace\% & 0.4\thinspace\% & 0.5\thinspace\% & 0.5\thinspace\% & 0.8\thinspace\% & 0.8\thinspace\% \\

		Linear system solver
		&

		91.1\thinspace\% & 91.5\thinspace\% & 91.6\thinspace\% & 91.2\thinspace\% & 91.5\thinspace\% & 91.3\thinspace\% & 91.6\thinspace\% \\

		Total
		&

		4709 s & 2485 s & 1249 s & 592 s & 311 s & 224 s & 202 s \\

		\bottomrule
		\end{tabular}

data/mcwhdd/portions_tnl.tex

0 → 100644

+59 −0

Original line number	Diff line number	Diff line


		% The table needs the following to be defined in the preamble:
		%
		% \usepackage{booktabs}
		% \usepackage{multirow}
		% \usepackage{adjustbox}
		% \usepackage{stackengine}
		% \usepackage[np]{numprint}

		\begin{tabular}{lrrrrrrr}
		\toprule

		% header row 0
		\multicolumn{1}{l}{Number of MPI ranks}
		& \multicolumn{1}{c}{\np{12}}
		& \multicolumn{1}{c}{\np{24}}
		& \multicolumn{1}{c}{\np{48}}
		& \multicolumn{1}{c}{\np{96}}
		& \multicolumn{1}{c}{\np{192}}
		& \multicolumn{1}{c}{\np{288}}
		& \multicolumn{1}{c}{\np{336}}
		\\

		\midrule


		MHFEM routines
		&

		0.9\thinspace\% & 0.9\thinspace\% & 1.0\thinspace\% & 1.0\thinspace\% & 1.1\thinspace\% & 1.1\thinspace\% & 1.0\thinspace\% \\

		MPI communication of mesh data
		&

		0.0\thinspace\% & 0.0\thinspace\% & 0.1\thinspace\% & 0.1\thinspace\% & 0.1\thinspace\% & 0.2\thinspace\% & 0.2\thinspace\% \\

		Sparse matrix assembly
		&

		0.7\thinspace\% & 0.6\thinspace\% & 0.6\thinspace\% & 0.6\thinspace\% & 0.6\thinspace\% & 0.6\thinspace\% & 0.5\thinspace\% \\

		Linear system solver (compute)
		&

		94.1\thinspace\% & 90.0\thinspace\% & 89.6\thinspace\% & 85.6\thinspace\% & 80.1\thinspace\% & 74.3\thinspace\% & 65.0\thinspace\% \\

		Linear system solver (MPI sync.)
		&

		4.3\thinspace\% & 8.4\thinspace\% & 8.7\thinspace\% & 12.7\thinspace\% & 18.1\thinspace\% & 23.8\thinspace\% & 33.3\thinspace\% \\

		Total
		&

		23839 s & 12184 s & 6029 s & 2974 s & 1380 s & 893 s & 852 s \\

		\bottomrule
		\end{tabular}