LBM section - added tables with GLUPS (df75b6e0) · Commits · Jakub Klinkovský / Dissertation

content/LBM.tex

+44 −2

Original line number	Diff line number	Diff line
		@@ -587,5 +587,47 @@ The algorithm tries to maximize the number of threads in the block up to \ic{max

		\subsection{Results}

		\inline{Comparison of some streaming patterns: our basic A-B pull-scheme vs A-A pattern}
		\inline{describe the benchmark problem}
		\inline{define GLUPS}
		\inline{describe the decompositions used in the benchmark: always $N_{\mathrm{proc}} = N_{\mathrm{part}}$}
		\inline{define rounding to the nearest integer: $\round{\cdot}$}

		\inline{Comparison of some streaming patterns: our A-B pull-scheme vs A-A pattern: \cref{tab:lbm:AB vs AA}}
		\begin{table}[tb]
		\caption{
		Performance comparison of the A-B pull scheme with the A-A streaming pattern in single and double precision on various NVIDIA GPU architectures.
		The lattice size is $256 \times 256 \times 256$ in all cases.
		}
		\label{tab:lbm:AB vs AA}
		\centering
		\input{data/lbm/AB_vs_AA/table.tex}
		\end{table}

		\inline{weak scaling and strong scaling on the Karolina supercomputer}
		\begin{table}[tb]
		\caption{
		Strong scaling in single and double precision on the Karolina supercomputer.
		The lattice size is $512 \times 512 \times 512$ in all cases.
		}
		\label{tab:lbm:strong scaling}
		\centering
		\input{data/lbm/scaling_on_Karolina/strong_scaling.tex}
		\end{table}
		\begin{table}[tb]
		\caption{
		Weak scaling with 1D domain expansion in single and double precision on the Karolina supercomputer.
		The lattice size is scaled as $256 \, N_{\mathrm{ranks}} \times 256 \times 256$.
		}
		\label{tab:lbm:weak scaling 1D}
		\centering
		\input{data/lbm/scaling_on_Karolina/weak_scaling_1D.tex}
		\end{table}
		\begin{table}[tb]
		\caption{
		Weak scaling with 3D domain expansion in single and double precision on the Karolina supercomputer.
		The lattice size is scaled as $N_x = N_y = N_z = 32 \, \round{16 \cbrt{N_{\mathrm{ranks}}}}$ in single precision and $N_x = N_y = N_z = 32 \, \round{8 \cbrt{N_{\mathrm{ranks}}}}$ in double precision.
		}
		\label{tab:lbm:weak scaling 3D}
		\centering
		\input{data/lbm/scaling_on_Karolina/weak_scaling_3D.tex}
		\end{table}
		No newline at end of file

data/lbm/AB_vs_AA/GeForce GTX 1080 Ti/AA_DP.log

0 → 100644

+103 −0

Original line number	Diff line number	Diff line
		Rank 0: rank on node is 0, using GPU id 0 of 2, CUDA_VISIBLE_DEVICES=
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 128, 128, 128 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 432 MiB
		CPU RAM for map: 4 MiB
		CPU RAM for macro: 64 MiB
		TOTAL CPU RAM 500 MiB estimated needed, 64242 MiB available (0.7783%)
		GPU RAM for DFs: 432 MiB
		GPU RAM for map: 4 MiB
		GPU RAM for macro: 64 MiB
		TOTAL GPU RAM 500 MiB estimated needed, 11034 MiB available (4.5312%), total GPU RAM: 11176 MiB
		PHYS_DL = 1.984127e-03
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[128,128,128]
		LBM block 0: local=[128,128,128], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 4.000000e-02 physDl 1.984127e-03 physDt 1.049803e-02
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.0023593 GB in 18 messages, received 0.0023593 GB in 18 messages, in 0.002844 seconds
		bandwidth: unidirectional 0.829568 GB/s, bidirectional 1.65914 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288207e-04 l2error_phys=2.149036e-03 stopping=4.369234e+03
		at t=10.00s, iterations=953 l1error_phys=8.274164e-05 l2error_phys=7.972284e-04 stopping=1.087651e+04
		GLUPS=0.197 iter=953 t=10.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=10s ETA=91s
		at t=20.01s, iterations=1906 l1error_phys=7.847638e-05 l2error_phys=6.985870e-04 stopping=1.019355e+04
		GLUPS=0.206 iter=1906 t=20.009s dt=1.05e-02 lbmVisc=4.00e-02 WT=20s ETA=79s
		at t=30.00s, iterations=2858 l1error_phys=7.840769e-05 l2error_phys=6.997345e-04 stopping=8.927193e+03
		GLUPS=0.205 iter=2858 t=30.003s dt=1.05e-02 lbmVisc=4.00e-02 WT=30s ETA=69s
		at t=40.01s, iterations=3811 l1error_phys=7.840212e-05 l2error_phys=6.995974e-04 stopping=7.652452e+03
		GLUPS=0.205 iter=3811 t=40.008s dt=1.05e-02 lbmVisc=4.00e-02 WT=39s ETA=59s
		at t=50.00s, iterations=4763 l1error_phys=7.840092e-05 l2error_phys=6.995902e-04 stopping=6.377174e+03
		GLUPS=0.204 iter=4763 t=50.002s dt=1.05e-02 lbmVisc=4.00e-02 WT=49s ETA=49s
		at t=60.01s, iterations=5716 l1error_phys=7.840084e-05 l2error_phys=6.995893e-04 stopping=5.101784e+03
		GLUPS=0.196 iter=5716 t=60.007s dt=1.05e-02 lbmVisc=4.00e-02 WT=59s ETA=40s
		at t=70.00s, iterations=6668 l1error_phys=7.840082e-05 l2error_phys=6.995891e-04 stopping=3.826388e+03
		GLUPS=0.189 iter=6668 t=70.001s dt=1.05e-02 lbmVisc=4.00e-02 WT=70s ETA=30s
		at t=80.01s, iterations=7621 l1error_phys=7.840082e-05 l2error_phys=6.995891e-04 stopping=2.550991e+03
		GLUPS=0.186 iter=7621 t=80.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=81s ETA=20s
		at t=90.01s, iterations=8574 l1error_phys=7.840082e-05 l2error_phys=6.995891e-04 stopping=1.275594e+03
		GLUPS=0.182 iter=8574 t=90.010s dt=1.05e-02 lbmVisc=4.00e-02 WT=92s ETA=10s
		at t=100.00s, iterations=9526 l1error_phys=7.840082e-05 l2error_phys=6.995891e-04 stopping=1.975037e-01
		GLUPS=0.182 iter=9526 t=100.004s dt=1.05e-02 lbmVisc=4.00e-02 WT=103s ETA=-0s
		physFinalTime reached
		total walltime: 102.6 s, SimInit time: 0.4 s, SimUpdate time: 102.1 s, AfterSimUpdate time: 0.1 s
		compute time: 102.0 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 0.196, based on compute time: 0.196
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 256, 256, 256 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 3456 MiB
		CPU RAM for map: 32 MiB
		CPU RAM for macro: 512 MiB
		TOTAL CPU RAM 4000 MiB estimated needed, 64242 MiB available (6.2264%)
		GPU RAM for DFs: 3456 MiB
		GPU RAM for map: 32 MiB
		GPU RAM for macro: 512 MiB
		TOTAL GPU RAM 4000 MiB estimated needed, 11034 MiB available (36.2497%), total GPU RAM: 11176 MiB
		PHYS_DL = 9.842520e-04
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[256,256,256]
		LBM block 0: local=[256,256,256], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 8.000000e-02 physDl 9.842520e-04 physDt 5.166677e-03
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.00943718 GB in 18 messages, received 0.00943718 GB in 18 messages, in 0.0104524 seconds
		bandwidth: unidirectional 0.902868 GB/s, bidirectional 1.80574 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288078e-04 l2error_phys=2.149036e-03 stopping=4.369480e+03
		at t=10.00s, iterations=1936 l1error_phys=8.316932e-05 l2error_phys=8.011302e-04 stopping=1.082057e+04
		GLUPS=0.173 iter=1936 t=10.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=188s ETA=1693s
		at t=20.00s, iterations=3871 l1error_phys=7.824425e-05 l2error_phys=6.966192e-04 stopping=1.022379e+04
		GLUPS=0.174 iter=3871 t=20.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=375s ETA=1500s
		at t=30.00s, iterations=5807 l1error_phys=7.815947e-05 l2error_phys=6.978556e-04 stopping=8.955548e+03
		GLUPS=0.173 iter=5807 t=30.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=563s ETA=1314s
		at t=40.00s, iterations=7742 l1error_phys=7.815431e-05 l2error_phys=6.977104e-04 stopping=7.676720e+03
		GLUPS=0.173 iter=7742 t=40.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=751s ETA=1127s
		at t=50.00s, iterations=9678 l1error_phys=7.815259e-05 l2error_phys=6.976981e-04 stopping=6.397440e+03
		GLUPS=0.172 iter=9678 t=50.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=940s ETA=939s
		at t=60.00s, iterations=11613 l1error_phys=7.815246e-05 l2error_phys=6.976969e-04 stopping=5.118000e+03
		GLUPS=0.172 iter=11613 t=60.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1128s ETA=752s
		at t=70.00s, iterations=13549 l1error_phys=7.815243e-05 l2error_phys=6.976965e-04 stopping=3.838552e+03
		GLUPS=0.172 iter=13549 t=70.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=1317s ETA=564s
		at t=80.00s, iterations=15484 l1error_phys=7.815242e-05 l2error_phys=6.976965e-04 stopping=2.559101e+03
		GLUPS=0.172 iter=15484 t=80.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1506s ETA=376s
		at t=90.00s, iterations=17420 l1error_phys=7.815242e-05 l2error_phys=6.976965e-04 stopping=1.279650e+03
		GLUPS=0.172 iter=17420 t=90.004s dt=5.17e-03 lbmVisc=8.00e-02 WT=1695s ETA=188s
		at t=100.00s, iterations=19355 l1error_phys=7.815242e-05 l2error_phys=6.976965e-04 stopping=1.993199e-01
		GLUPS=0.172 iter=19355 t=100.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1883s ETA=-0s
		physFinalTime reached
		total walltime: 1883.2 s, SimInit time: 2.8 s, SimUpdate time: 1879.4 s, AfterSimUpdate time: 0.7 s
		compute time: 1878.5 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 0.173, based on compute time: 0.173
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 512, 512, 512 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 27648 MiB
		CPU RAM for map: 256 MiB
		CPU RAM for macro: 4096 MiB
		TOTAL CPU RAM 32000 MiB estimated needed, 64242 MiB available (49.8111%)
		GPU RAM for DFs: 27648 MiB
		GPU RAM for map: 256 MiB
		GPU RAM for macro: 4096 MiB
		TOTAL GPU RAM 32000 MiB estimated needed, 11034 MiB available (289.9979%), total GPU RAM: 11176 MiB
		Not enough memory available (CPU or GPU). [disable this check in lbm3d/state.h -> State constructor]

data/lbm/AB_vs_AA/GeForce GTX 1080 Ti/AA_SP.log

0 → 100644

+103 −0

Original line number	Diff line number	Diff line
		Rank 0: rank on node is 0, using GPU id 0 of 2, CUDA_VISIBLE_DEVICES=
		CUDA block size optimizer: using block size [ 1, 128, 2 ] for subdomain size [ 128, 128, 128 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 216 MiB
		CPU RAM for map: 4 MiB
		CPU RAM for macro: 32 MiB
		TOTAL CPU RAM 252 MiB estimated needed, 64242 MiB available (0.3923%)
		GPU RAM for DFs: 216 MiB
		GPU RAM for map: 4 MiB
		GPU RAM for macro: 32 MiB
		TOTAL GPU RAM 252 MiB estimated needed, 11034 MiB available (2.2837%), total GPU RAM: 11176 MiB
		PHYS_DL = 1.984127e-03
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[128,128,128]
		LBM block 0: local=[128,128,128], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 4.000000e-02 physDl 1.984127e-03 physDt 1.049803e-02
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.00117965 GB in 18 messages, received 0.00117965 GB in 18 messages, in 0.00160096 seconds
		bandwidth: unidirectional 0.736837 GB/s, bidirectional 1.47367 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288207e-04 l2error_phys=2.149036e-03 stopping=4.369234e+03
		at t=10.00s, iterations=953 l1error_phys=8.274176e-05 l2error_phys=7.972299e-04 stopping=1.087649e+04
		GLUPS=1.168 iter=953 t=10.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=2s ETA=15s
		at t=20.01s, iterations=1906 l1error_phys=7.847617e-05 l2error_phys=6.985830e-04 stopping=1.019357e+04
		GLUPS=1.407 iter=1906 t=20.009s dt=1.05e-02 lbmVisc=4.00e-02 WT=3s ETA=13s
		at t=30.00s, iterations=2858 l1error_phys=7.840743e-05 l2error_phys=6.997295e-04 stopping=8.927223e+03
		GLUPS=1.407 iter=2858 t=30.003s dt=1.05e-02 lbmVisc=4.00e-02 WT=5s ETA=11s
		at t=40.01s, iterations=3811 l1error_phys=7.840188e-05 l2error_phys=6.995926e-04 stopping=7.652475e+03
		GLUPS=1.408 iter=3811 t=40.008s dt=1.05e-02 lbmVisc=4.00e-02 WT=6s ETA=9s
		at t=50.00s, iterations=4763 l1error_phys=7.840067e-05 l2error_phys=6.995855e-04 stopping=6.377194e+03
		GLUPS=1.409 iter=4763 t=50.002s dt=1.05e-02 lbmVisc=4.00e-02 WT=7s ETA=7s
		at t=60.01s, iterations=5716 l1error_phys=7.840060e-05 l2error_phys=6.995845e-04 stopping=5.101799e+03
		GLUPS=1.409 iter=5716 t=60.007s dt=1.05e-02 lbmVisc=4.00e-02 WT=9s ETA=6s
		at t=70.00s, iterations=6668 l1error_phys=7.840058e-05 l2error_phys=6.995844e-04 stopping=3.826400e+03
		GLUPS=1.409 iter=6668 t=70.001s dt=1.05e-02 lbmVisc=4.00e-02 WT=10s ETA=4s
		at t=80.01s, iterations=7621 l1error_phys=7.840058e-05 l2error_phys=6.995843e-04 stopping=2.550999e+03
		GLUPS=1.408 iter=7621 t=80.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=12s ETA=3s
		at t=90.01s, iterations=8574 l1error_phys=7.840058e-05 l2error_phys=6.995843e-04 stopping=1.275598e+03
		GLUPS=1.409 iter=8574 t=90.010s dt=1.05e-02 lbmVisc=4.00e-02 WT=13s ETA=1s
		at t=100.00s, iterations=9526 l1error_phys=7.840058e-05 l2error_phys=6.995843e-04 stopping=1.975051e-01
		GLUPS=1.409 iter=9526 t=100.004s dt=1.05e-02 lbmVisc=4.00e-02 WT=14s ETA=-0s
		physFinalTime reached
		total walltime: 14.5 s, SimInit time: 0.3 s, SimUpdate time: 14.1 s, AfterSimUpdate time: 0.1 s
		compute time: 14.0 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 1.415, based on compute time: 1.428
		CUDA block size optimizer: using block size [ 1, 256, 1 ] for subdomain size [ 256, 256, 256 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 1728 MiB
		CPU RAM for map: 32 MiB
		CPU RAM for macro: 256 MiB
		TOTAL CPU RAM 2016 MiB estimated needed, 64242 MiB available (3.1381%)
		GPU RAM for DFs: 1728 MiB
		GPU RAM for map: 32 MiB
		GPU RAM for macro: 256 MiB
		TOTAL GPU RAM 2016 MiB estimated needed, 11034 MiB available (18.2699%), total GPU RAM: 11176 MiB
		PHYS_DL = 9.842520e-04
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[256,256,256]
		LBM block 0: local=[256,256,256], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 8.000000e-02 physDl 9.842520e-04 physDt 5.166677e-03
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.00471859 GB in 18 messages, received 0.00471859 GB in 18 messages, in 0.00518429 seconds
		bandwidth: unidirectional 0.910172 GB/s, bidirectional 1.82034 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288078e-04 l2error_phys=2.149036e-03 stopping=4.369480e+03
		at t=10.00s, iterations=1936 l1error_phys=8.316908e-05 l2error_phys=8.011306e-04 stopping=1.082060e+04
		GLUPS=1.309 iter=1936 t=10.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=25s ETA=223s
		at t=20.00s, iterations=3871 l1error_phys=7.824463e-05 l2error_phys=6.966187e-04 stopping=1.022374e+04
		GLUPS=1.441 iter=3871 t=20.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=47s ETA=189s
		at t=30.00s, iterations=5807 l1error_phys=7.815992e-05 l2error_phys=6.978573e-04 stopping=8.955496e+03
		GLUPS=1.441 iter=5807 t=30.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=70s ETA=163s
		at t=40.00s, iterations=7742 l1error_phys=7.815479e-05 l2error_phys=6.977119e-04 stopping=7.676672e+03
		GLUPS=1.443 iter=7742 t=40.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=92s ETA=139s
		at t=50.00s, iterations=9678 l1error_phys=7.815302e-05 l2error_phys=6.976999e-04 stopping=6.397405e+03
		GLUPS=1.448 iter=9678 t=50.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=115s ETA=115s
		at t=60.00s, iterations=11613 l1error_phys=7.815291e-05 l2error_phys=6.976984e-04 stopping=5.117971e+03
		GLUPS=1.450 iter=11613 t=60.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=137s ETA=91s
		at t=70.00s, iterations=13549 l1error_phys=7.815287e-05 l2error_phys=6.976981e-04 stopping=3.838530e+03
		GLUPS=1.450 iter=13549 t=70.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=160s ETA=68s
		at t=80.00s, iterations=15484 l1error_phys=7.815286e-05 l2error_phys=6.976980e-04 stopping=2.559087e+03
		GLUPS=1.450 iter=15484 t=80.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=182s ETA=45s
		at t=90.00s, iterations=17420 l1error_phys=7.815286e-05 l2error_phys=6.976980e-04 stopping=1.279643e+03
		GLUPS=1.450 iter=17420 t=90.004s dt=5.17e-03 lbmVisc=8.00e-02 WT=204s ETA=23s
		at t=100.00s, iterations=19355 l1error_phys=7.815286e-05 l2error_phys=6.976980e-04 stopping=1.993173e-01
		GLUPS=1.450 iter=19355 t=100.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=227s ETA=-0s
		physFinalTime reached
		total walltime: 226.8 s, SimInit time: 2.1 s, SimUpdate time: 223.9 s, AfterSimUpdate time: 0.4 s
		compute time: 223.5 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 1.447, based on compute time: 1.453
		CUDA block size optimizer: using block size [ 1, 256, 1 ] for subdomain size [ 512, 512, 512 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 13824 MiB
		CPU RAM for map: 256 MiB
		CPU RAM for macro: 2048 MiB
		TOTAL CPU RAM 16128 MiB estimated needed, 64242 MiB available (25.1048%)
		GPU RAM for DFs: 13824 MiB
		GPU RAM for map: 256 MiB
		GPU RAM for macro: 2048 MiB
		TOTAL GPU RAM 16128 MiB estimated needed, 11034 MiB available (146.1589%), total GPU RAM: 11176 MiB
		Not enough memory available (CPU or GPU). [disable this check in lbm3d/state.h -> State constructor]

data/lbm/AB_vs_AA/GeForce GTX 1080 Ti/AB_DP.log

0 → 100644

+103 −0

Original line number	Diff line number	Diff line
		Rank 0: rank on node is 0, using GPU id 0 of 2, CUDA_VISIBLE_DEVICES=
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 128, 128, 128 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 864 MiB
		CPU RAM for map: 4 MiB
		CPU RAM for macro: 64 MiB
		TOTAL CPU RAM 932 MiB estimated needed, 64242 MiB available (1.4507%)
		GPU RAM for DFs: 864 MiB
		GPU RAM for map: 4 MiB
		GPU RAM for macro: 64 MiB
		TOTAL GPU RAM 932 MiB estimated needed, 11034 MiB available (8.4462%), total GPU RAM: 11176 MiB
		PHYS_DL = 1.984127e-03
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[128,128,128]
		LBM block 0: local=[128,128,128], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 4.000000e-02 physDl 1.984127e-03 physDt 1.049803e-02
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.0023593 GB in 18 messages, received 0.0023593 GB in 18 messages, in 0.00284351 seconds
		bandwidth: unidirectional 0.829712 GB/s, bidirectional 1.65942 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288207e-04 l2error_phys=2.149036e-03 stopping=4.369234e+03
		at t=10.00s, iterations=953 l1error_phys=6.545574e-05 l2error_phys=6.410678e-04 stopping=1.374910e+04
		GLUPS=0.192 iter=953 t=10.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=10s ETA=94s
		at t=20.01s, iterations=1906 l1error_phys=5.836361e-05 l2error_phys=5.264212e-04 stopping=1.370668e+04
		GLUPS=0.205 iter=1906 t=20.009s dt=1.05e-02 lbmVisc=4.00e-02 WT=20s ETA=81s
		at t=30.00s, iterations=2858 l1error_phys=5.834354e-05 l2error_phys=5.275025e-04 stopping=1.199751e+04
		GLUPS=0.205 iter=2858 t=30.003s dt=1.05e-02 lbmVisc=4.00e-02 WT=30s ETA=70s
		at t=40.01s, iterations=3811 l1error_phys=5.833265e-05 l2error_phys=5.273680e-04 stopping=1.028554e+04
		GLUPS=0.204 iter=3811 t=40.008s dt=1.05e-02 lbmVisc=4.00e-02 WT=40s ETA=59s
		at t=50.00s, iterations=4763 l1error_phys=5.833230e-05 l2error_phys=5.273669e-04 stopping=8.571385e+03
		GLUPS=0.193 iter=4763 t=50.002s dt=1.05e-02 lbmVisc=4.00e-02 WT=50s ETA=50s
		at t=60.01s, iterations=5716 l1error_phys=5.833224e-05 l2error_phys=5.273666e-04 stopping=6.857175e+03
		GLUPS=0.185 iter=5716 t=60.007s dt=1.05e-02 lbmVisc=4.00e-02 WT=61s ETA=41s
		at t=70.00s, iterations=6668 l1error_phys=5.833224e-05 l2error_phys=5.273666e-04 stopping=5.142958e+03
		GLUPS=0.181 iter=6668 t=70.001s dt=1.05e-02 lbmVisc=4.00e-02 WT=72s ETA=31s
		at t=80.01s, iterations=7621 l1error_phys=5.833224e-05 l2error_phys=5.273666e-04 stopping=3.428740e+03
		GLUPS=0.178 iter=7621 t=80.005s dt=1.05e-02 lbmVisc=4.00e-02 WT=83s ETA=21s
		at t=90.01s, iterations=8574 l1error_phys=5.833224e-05 l2error_phys=5.273666e-04 stopping=1.714523e+03
		GLUPS=0.177 iter=8574 t=90.010s dt=1.05e-02 lbmVisc=4.00e-02 WT=94s ETA=10s
		at t=100.00s, iterations=9526 l1error_phys=5.833224e-05 l2error_phys=5.273666e-04 stopping=3.045573e-01
		GLUPS=0.177 iter=9526 t=100.004s dt=1.05e-02 lbmVisc=4.00e-02 WT=106s ETA=-0s
		physFinalTime reached
		total walltime: 105.7 s, SimInit time: 0.6 s, SimUpdate time: 105.0 s, AfterSimUpdate time: 0.1 s
		compute time: 104.8 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 0.190, based on compute time: 0.191
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 256, 256, 256 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 6912 MiB
		CPU RAM for map: 32 MiB
		CPU RAM for macro: 512 MiB
		TOTAL CPU RAM 7456 MiB estimated needed, 64242 MiB available (11.6060%)
		GPU RAM for DFs: 6912 MiB
		GPU RAM for map: 32 MiB
		GPU RAM for macro: 512 MiB
		TOTAL GPU RAM 7456 MiB estimated needed, 11034 MiB available (67.5695%), total GPU RAM: 11176 MiB
		PHYS_DL = 9.842520e-04
		in lbm units: forcing=1.000000e-04
		MPI info: rank=0, nproc=1, lat.global=[256,256,256]
		LBM block 0: local=[256,256,256], offset=[0,0,0]

		START: simulation NSE:CUM lbmVisc 8.000000e-02 physDl 9.842520e-04 physDt 5.166677e-03
		Rank 0 MPI synchronization stats (last iteration):
		sent 0.00943718 GB in 18 messages, received 0.00943718 GB in 18 messages, in 0.0111821 seconds
		bandwidth: unidirectional 0.843955 GB/s, bidirectional 1.68791 GB/s
		at t=0.00s, iterations=0 l1error_phys=2.288078e-04 l2error_phys=2.149036e-03 stopping=4.369480e+03
		at t=10.00s, iterations=1936 l1error_phys=6.606405e-05 l2error_phys=6.471638e-04 stopping=1.362249e+04
		GLUPS=0.168 iter=1936 t=10.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=193s ETA=1735s
		at t=20.00s, iterations=3871 l1error_phys=5.814369e-05 l2error_phys=5.258672e-04 stopping=1.375852e+04
		GLUPS=0.173 iter=3871 t=20.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=381s ETA=1524s
		at t=30.00s, iterations=5807 l1error_phys=5.812542e-05 l2error_phys=5.271499e-04 stopping=1.204253e+04
		GLUPS=0.172 iter=5807 t=30.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=569s ETA=1328s
		at t=40.00s, iterations=7742 l1error_phys=5.811243e-05 l2error_phys=5.269914e-04 stopping=1.032452e+04
		GLUPS=0.172 iter=7742 t=40.000s dt=5.17e-03 lbmVisc=8.00e-02 WT=757s ETA=1136s
		at t=50.00s, iterations=9678 l1error_phys=5.811186e-05 l2error_phys=5.269876e-04 stopping=8.603903e+03
		GLUPS=0.173 iter=9678 t=50.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=946s ETA=946s
		at t=60.00s, iterations=11613 l1error_phys=5.811176e-05 l2error_phys=5.269872e-04 stopping=6.883196e+03
		GLUPS=0.173 iter=11613 t=60.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1134s ETA=756s
		at t=70.00s, iterations=13549 l1error_phys=5.811175e-05 l2error_phys=5.269871e-04 stopping=5.162475e+03
		GLUPS=0.173 iter=13549 t=70.003s dt=5.17e-03 lbmVisc=8.00e-02 WT=1322s ETA=567s
		at t=80.00s, iterations=15484 l1error_phys=5.811175e-05 l2error_phys=5.269871e-04 stopping=3.441753e+03
		GLUPS=0.173 iter=15484 t=80.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1510s ETA=378s
		at t=90.00s, iterations=17420 l1error_phys=5.811175e-05 l2error_phys=5.269871e-04 stopping=1.721030e+03
		GLUPS=0.173 iter=17420 t=90.004s dt=5.17e-03 lbmVisc=8.00e-02 WT=1698s ETA=189s
		at t=100.00s, iterations=19355 l1error_phys=5.811175e-05 l2error_phys=5.269871e-04 stopping=3.075021e-01
		GLUPS=0.173 iter=19355 t=100.001s dt=5.17e-03 lbmVisc=8.00e-02 WT=1886s ETA=-0s
		physFinalTime reached
		total walltime: 1886.3 s, SimInit time: 4.5 s, SimUpdate time: 1881.1 s, AfterSimUpdate time: 0.5 s
		compute time: 1880.2 s, compute overlaps time: 0.0 s, wait for communication time: 0.0 s, wait for computation time: 0.0 s
		final GLUPS: average (based on SimInit + SimUpdate + AfterSimUpdate time): 0.173, based on compute time: 0.173
		CUDA block size optimizer: using block size [ 1, 128, 1 ] for subdomain size [ 512, 512, 512 ]
		Rank 0 uses GPU id 0: NVIDIA GeForce GTX 1080 Ti
		Local memory budget analysis / estimation for MPI rank 0
		CPU RAM for DFs: 55296 MiB
		CPU RAM for map: 256 MiB
		CPU RAM for macro: 4096 MiB
		TOTAL CPU RAM 59648 MiB estimated needed, 64242 MiB available (92.8480%)
		GPU RAM for DFs: 55296 MiB
		GPU RAM for map: 256 MiB
		GPU RAM for macro: 4096 MiB
		TOTAL GPU RAM 59648 MiB estimated needed, 11034 MiB available (540.5561%), total GPU RAM: 11176 MiB
		Not enough memory available (CPU or GPU). [disable this check in lbm3d/state.h -> State constructor]

data/lbm/AB_vs_AA/GeForce GTX 1080 Ti/AB_SP.log

0 → 100644

+103 −0

File added.

Preview size limit exceeded, changes collapsed.