-
Notifications
You must be signed in to change notification settings - Fork 7
/
20141031-Structure.tex
516 lines (461 loc) · 20.3 KB
/
20141031-Structure.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
% \documentclass[handout]{beamer}
\documentclass{beamer}
\mode<presentation>
{
\usetheme{ANLBlue}
% \usefonttheme[onlymath]{serif}
% \usetheme{Singapore}
% \usetheme{Warsaw}
% \usetheme{Malmoe}
% \useinnertheme{circles}
% \useoutertheme{infolines}
% \useinnertheme{rounded}
\setbeamercovered{transparent=20}
}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{alltt,listings,multirow,ulem,siunitx}
\usepackage[absolute,overlay]{textpos}
\TPGrid{1}{1}
\usepackage{pdfpages}
\usepackage{ulem}
\usepackage{multimedia}
\usepackage{multicol}
\newcommand\hmmax{0}
\newcommand\bmmax{0}
\usepackage{bm}
\usepackage{comment}
\usepackage{subcaption}
% font definitions, try \usepackage{ae} instead of the following
% three lines if you don't like this look
\usepackage{mathptmx}
\usepackage[scaled=.90]{helvet}
% \usepackage{courier}
\usepackage[T1]{fontenc}
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{shadows,arrows,shapes.misc,shapes.arrows,shapes.multipart,arrows,decorations.pathmorphing,backgrounds,positioning,fit,petri,calc,shadows,chains,matrix}
\newcommand\vvec{\bm v}
\newcommand\bvec{\bm b}
\newcommand\bxk{\bvec_0 \times \kappa_0 \cdot \nabla}
\newcommand\delp{\nabla_\perp}
% \usepackage{pgfpages}
% \pgfpagesuselayout{4 on 1}[a4paper,landscape,border shrink=5mm]
\usepackage{JedMacros}
% Notation macros. NOTE(review): several of these are not referenced in this
% file's own frames and are presumably consumed by the \input{slides/...}
% files -- confirm before renaming or removing any of them.
%
% Time symbols t_R and t_W with upright subscripts.
\newcommand{\timeR}{t_{\mathrm{R}}}
\newcommand{\timeW}{t_{\mathrm{W}}}
% Level symbol \ell and its labelled variants (cp, coarse, fine).
% \ensuremath makes them usable in both text and math mode.
\newcommand{\mglevel}{\ensuremath{\ell}}
\newcommand{\mglevelcp}{\ensuremath{\mglevel_{\mathrm{cp}}}}
\newcommand{\mglevelcoarse}{\ensuremath{\mglevel_{\mathrm{coarse}}}}
\newcommand{\mglevelfine}{\ensuremath{\mglevel_{\mathrm{fine}}}}
% Solution and residual symbols: x, \hat{x}, r, b.
\newcommand{\vx}{\ensuremath{x}}
\newcommand{\vc}{\ensuremath{\hat{x}}}
\newcommand{\vr}{\ensuremath{r}}
\newcommand{\vb}{\ensuremath{b}}
% Operator symbols. \vP and \vR typeset I_H^h and I_h^H, and \vI the hatted
% I_h^H; presumably prolongation, restriction and state restriction -- TODO confirm.
\newcommand{\vA}{\ensuremath{A}}
\newcommand{\vP}{\ensuremath{I_H^h}}
\newcommand{\vS}{\ensuremath{S}}
\newcommand{\vR}{\ensuremath{I_h^H}}
\newcommand{\vI}{\ensuremath{\hat I_h^H}}
\newcommand{\vV}{\ensuremath{\mathbf{V}}}
\newcommand{\vF}{\ensuremath{F}}
% NOTE(review): \mathbf does not embolden lowercase Greek letters, so \vtau
% currently renders as a plain \tau. If a bold tau is intended, \bm{\tau}
% would be needed (the bm package is loaded earlier in this preamble).
\newcommand{\vtau}{\ensuremath{\mathbf{\tau}}}
\title{Exploiting structure with implicit methods}
\subtitle{This talk: \url{http://59A2.org/files/20141031-Structure.pdf}}
\begin{comment}
Implicit solvers are uniquely powerful instruments for managing
disparate scales as well as many tasks in the analysis of models, but
performance can be subtle and challenging. "Black-box" solvers suffer
from limited applicability and poor efficiency on modern hardware.
Building efficient solvers relies increasingly on pairing
problem-specific structure with architectural capability and
expressing the resulting algorithms in terms of existing software.
Library design is the inverse problem: create maximally-reusable
components to satisfy a diverse range of present and future customers
and architectures. This talk discusses recent examples in PETSc's
evolution and our vision for implicit solvers.
\end{comment}
\author{{\bf Jed Brown} \texttt{jedbrown@mcs.anl.gov} (ANL and CU Boulder) \\
Collaborators in this work: \\
\quad Mark Adams (LBL), Peter Brune (ANL/Google), Emil Constantinescu (ANL), \\
Debojyoti Ghosh (ANL), Matt Knepley (UChicago), \\
Dave May (ETH Z\"urich), Lois Curfman McInnes (ANL), \\
Barry Smith (ANL)
}
% - Use the \inst command only if there are several affiliations.
% - Keep it simple, no one is interested in your street address.
% \institute
% {
% Mathematics and Computer Science Division \\ Argonne National Laboratory
% }
\date{UC Merced, 2014-10-31}
% This is only inserted into the PDF information catalog. Can be left
% out.
\subject{Talks}
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
% \pgfdeclareimage[height=0.5cm]{university-logo}{university-logo-filename}
% \logo{\pgfuseimage{university-logo}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
% \AtBeginSubsection[]
% {
% \begin{frame}<beamer>
% \frametitle{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
% }
\AtBeginSection[]
{
\begin{frame}<beamer>
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
% \beamerdefaultoverlayspecification{<+->}
\begin{document}
\lstset{language=C}
\normalem
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Why implicit?}
\begin{itemize}
\item Nature has many spatial and temporal scales
\begin{itemize}
\item Porous media, structures, fluids, kinetics
\end{itemize}
\item Science/engineering problem statement does not weak scale
\begin{itemize}
\item More time steps required at high resolution
\end{itemize}
\item Robust discretizations and implicit solvers are needed to cope
\item Computer architecture is increasingly hierarchical
\begin{itemize}
\item algorithms should conform to this structure
\end{itemize}
\item Sparse matrices are comfortable, but outdated
\begin{itemize}
\item Algebraic multigrid, factorization
\item Memory bandwidth-limited
\end{itemize}
\item ``black box'' solvers are not sustainable
\begin{itemize}
\item optimal solvers must accurately handle all scales
\item optimality is crucial for large-scale problems
\item hardware puts up a spirited fight to abstraction
\end{itemize}
\end{itemize}
\end{frame}
\input{slides/MonolithicOrSplit.tex}
\input{slides/PETSc/Coupling.tex}
\input{slides/FieldSplit.tex}
\input{slides/PETSc/LocalSpaces.tex}
\includepdf[pages=1-2]{davemay.pdf}
\input{slides/HydrostaticEigen.tex}
\begin{frame}{Plugins in PETSc}
\begin{block}{Philosophy: Everything has a plugin architecture}
\begin{itemize}
\item Vectors, Matrices, Coloring/ordering/partitioning algorithms
\item Preconditioners, Krylov accelerators
\item Nonlinear solvers, Time integrators
\item Spatial discretizations/topology$^*$
\end{itemize}
\end{block}
\begin{example}
Third party supplies matrix format and associated preconditioner, distributes
compiled shared library. Application user loads plugin at runtime, no source
code in sight.
\end{example}
\end{frame}
\begin{frame}[shrink=5]{Performance of assembled versus unassembled}
\vspace{1ex}
\includegraphics[width=\textwidth]{figures/TensorVsAssembly} \\
\begin{itemize}
\item Arithmetic intensity for $\Qk p$ elements
\begin{itemize}
\item $< \frac 1 4$ (assembled), $\approx 10$ (unassembled), $\approx 4$ to $8$ (hardware)
\end{itemize}
\item store Jacobian information at Gauss quadrature points, can use AD
\end{itemize}
\end{frame}
\input{slides/Dohp/StokesScaling.tex}
\begin{frame}{\texttt{pTatin3d}: Long-term lithosphere dynamics}
\begin{center}
\includegraphics[width=0.8\textwidth]{figures/LaetiRifting}
\end{center}
\begin{itemize}
\item Dave May (ETH Z\"urich), Laetitia Le Pourhiet (UPMC Paris)
\item Visco-elasto-plastic rheology
\item Material-point method for material composition, $10^{10}$ jumps
\item Large deformation, post-failure analysis
\item Free surface: $Q_2 - P_1^{\text{disc}}$ (non-affine)
\end{itemize}
\end{frame}
\begin{frame}{\texttt{pTatin3d}: Long-term lithosphere dynamics}
\begin{itemize}
\item Assembled matrices: $9216 F/38912 B = \alert{0.235 F/B}$
\begin{itemize}
\item Problem size limited by memory
\item Mediocre performance, limited by memory bandwidth
\item Poor scalability within a node (memory contention)
\item Lots of experimentation with different algorithms
\item Multigrid: matrix-free on finest levels
\end{itemize}
\item Matrix-free: $51435 F/824 B = \alert{62.42 F/B}$
\begin{itemize}
\item $81\times 27$ element gradient matrix
\item Element setup computes physical gradient matrix
\item $1.5\times$ speedup when using all cores
\end{itemize}
\item Tensor-product matrix-free: $16686 F/824 B = \alert{20.25 F/B}$
\begin{itemize}
\item Tensor contractions with $3\times 3$ 1D matrices
\item Tiny working set, vectorize over 4 elements within L1 cache
\item 30\% of Haswell FMA peak, register load/store limited
\item $7\times$ speedup ($5\times$ speedup on Sandy Bridge AVX)
\end{itemize}
\end{itemize}
\end{frame}
\input{slides/HardwareArithmeticIntensity.tex}
\begin{frame}{This is a dead end}
\begin{itemize}
\item Arithmetic intensity $< 1/4$
\item Idea: multiple right hand sides
\begin{equation*}
\frac{(2 k \text{ flops})(\text{bandwidth})}{\texttt{sizeof(Scalar)} + \texttt{sizeof(Int)}}, \quad k \ll \text{avg. nz/row}
\end{equation*}
\item Problem: popular algorithms have nested data dependencies
\begin{itemize}
\item Time step \\
\qquad Nonlinear solve \\
\qquad \qquad Krylov solve \\
\qquad \qquad \qquad Preconditioner/sparse matrix
\end{itemize}
\item Cannot parallelize/vectorize these nested loops
\item<2> \alert{Can we create new algorithms to reorder/fuse loops?}
\begin{itemize}
\item Reduce latency-sensitivity for communication
\item Reduce memory bandwidth (reuse matrix)
\item Implicit Runge-Kutta, creates tensor product structure
\item Full space/one-shot methods for PDE-constrained optimization
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Beyond global linearization: FAS multigrid}
\includegraphics[width=\textwidth]{figures/BruneNGMRESFAS.png}
\begin{itemize}
\item Geometric coarse grids and rediscretization
\end{itemize}
\end{frame}
\begin{frame}{Lagged quasi-Newton for nonlinear elasticity}
\begin{tabular}{llll llll}
\toprule
Method & Lag & LS & Linear Solve & Its. & $F(u)$ & Jacobian & $P^{-1}$ \\
\midrule
LBFGS & 3 & cp & \texttt{preonly} & 18 & 37 & 5 & 18 \\
LBFGS & 3 & cp & \num{1e-05} & 21 & 43 & 6 & 173 \\
LBFGS & 6 & cp & \texttt{preonly} & 24 & 49 & 4 & 24 \\
LBFGS & 6 & cp & \num{1e-05} & 30 & 61 & 5 & 266 \\[1ex]
JFNK & 0 & cp & \texttt{preonly} & 11 & 23 & 11 & 11 \\
JFNK & 0 & cp & \num{1e-05} & 8 & 69 & 8 & 60 \\
JFNK & 1 & cp & \texttt{preonly} & 15 & 31 & 8 & 15 \\
JFNK & 1 & cp & \num{1e-05} & 7 & 2835 & 4 & 2827 \\
JFNK & 3 & cp & \texttt{preonly} & 23 & 47 & 6 & 23 \\
JFNK & 3 & cp & \num{1e-05} & 7 & 3143 & 2 & 3135 \\
\bottomrule
\end{tabular}
\begin{itemize}
\item B and Brune, MC2013
\end{itemize}
\end{frame}
\input{slides/PETSc/TSARKIMEX.tex}
% \input{slides/MG/TauFAS.tex}
\begin{frame}{$\tau$ corrections}
\begin{figure}
\centering
\begin{subfigure}[b]{0.18\textwidth}
\includegraphics[width=\textwidth]{figures/MG/ElasticityCompressTrim}
%\caption{Initial solution.}\label{fig:elast-initial}
\end{subfigure} ~
\begin{subfigure}[b]{0.18\textwidth}
\includegraphics[width=\textwidth]{figures/MG/ElasticityCompressShearTrim}
%\caption{Increment.}\label{fig:elast-increment}
\end{subfigure} ~
\begin{subfigure}[b]{0.28\textwidth}
\includegraphics[width=\textwidth]{figures/MG/ElasticityCompressErrorNoTauTrim}
%\caption{Smoothed error without $\tau$.}\label{fig:elast-error-notau}
\end{subfigure} ~
\begin{subfigure}[b]{0.28\textwidth}
\includegraphics[width=\textwidth]{figures/MG/ElasticityCompressErrorTauTrim}
%\caption{Smoothed error with $\tau$.}\label{fig:elast-error-tau}
\end{subfigure}
\begin{itemize}
\item Plane strain elasticity, $E=1000,\nu=0.4$ inclusions in $E=1,\nu=0.2$ material, coarsen by $3^2$.
\item Solve initial problem everywhere and compute $\tau_h^H = A^H \hat I_h^H u^h - I_h^H A^h u^h$
\item Change boundary conditions and solve FAS coarse problem
\begin{equation*}
N^H \acute u^H = \underbrace{I_h^H \acute f^h}_{\acute f^H} + \underbrace{N^H \hat I_h^H \tilde u^h - I_h^H N^h \tilde u^h}_{\tau_h^H}
\end{equation*}
\item Prolong, post-smooth, compute error $e^h = \acute u^h - (N^h)^{-1} \acute f^h$
\item<2> \alert{Coarse grid \emph{with $\tau$} is nearly $10\times$ better accuracy}
\end{itemize}
% \caption{Plane strain elasticity, $E=1000,\nu=0.4$ inclusions in $E=1,\nu=0.2$ material. 2-level multigrid with coarsening factor of $3^2$.
% Panes (a) and (b) show the deformed body colored by strain.
% The initial problem of compression by 0.2 from the right is solved (a) and $\tau = A^H \hat I_h^H u^h - I_h^H A^h u^h$ is computed.
% Then a shear increment of 0.1 in the $y$ direction is added to the boundary condition, and the coarse-level problem is resolved, interpolated to the fine-grid, and a post-smoother is applied.
% When the coarse problem is solved without a $\tau$ correction (c), the displacement error is nearly $10\times$ larger than when $\tau$ is included in the right hand side of the coarse problem (d).
% }\label{fig:tau-valid}
% ./ex49 -mx 90 -my 90 -da_refine_x 3 -da_refine_y 3 -elas_ksp_converged_reason -elas_ksp_rtol 1e-8 -no_view -c_str 3 -sponge_E0 1 -sponge_E1 1e3 -sponge_nu0 0.4 -sponge_nu1 0.2 -sponge_t 3 -sponge_w 9 -u_o vtk:ex49_sol.vts -use_nonsymbc -elas_pc_type mg -elas_pc_mg_levels 2 -elas_pc_mg_galerkin -tau1_o vtk:ex49_tau1.vts -tau2_o vtk:ex49_tau2.vts -taudiff_o vtk:ex49_taudiff.vts -u2_o vtk:ex49_sol2.vts -u2c_o vtk:ex49_sol2c.vts -u3_o vtk:ex49_sol3.vts -u4_o vtk:ex49_sol4.vts -u2err_o vtk:ex49_sol2err.vts -u3err_o vtk:ex49_sol3err.vts -u3c_o vtk:ex49_sol3c.vts -tau3_o vtk:ex49_tau3.vts
\end{figure}
\end{frame}
\input{slides/MG/LowComm.tex}
\begin{frame}{Segmental refinement: no horizontal communication}
\begin{itemize}
\item 27-point second-order stencil, manufactured analytic solution
\item 5 SR levels: $16^3$ cells/process local coarse grid
\item $\text{Overlap} = \text{Base} + (L-\ell) \text{Increment}$
\begin{itemize}
\item Implementation requires even number of cells---round down.
\end{itemize}
\item FMG with $V(2,2)$ cycles
\end{itemize}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{table}\small
\centering\caption{$\norm{e_{SR}}_\infty / \norm{e_{FMG}}_\infty$}\label{tab:sr-error}
\begin{tabular}{l rrr}
\toprule
& \multicolumn{3}{c}{Base} \\
Increment & 1 & 2 & 3 \\
\midrule
1 & {\color{red} 1.59} & {\color{red} 2.34} & 1.00 \\
2 & 1.00 & 1.00 & 1.00 \\
3 & 1.00 & 1.00 & 1.00 \\
\bottomrule
\end{tabular}
\end{table}
\end{column}
\begin{column}{0.6\textwidth}
\includegraphics[width=\textwidth]{figures/MG/weak_scaling_edison-eps-converted-to.pdf}
\end{column}
\end{columns}
\end{frame}
\input{slides/MG/SmoothingNonlinearProblems.tex}
\begin{frame}[fragile]{Multiscale compression and recovery using $\tau$ form}
% Checkpoint/recovery diagram: two chains of level nodes on the left (placed
% with \mgloc), a checkpoint node "CP", and a boxed "FMG Recovery" sequence on
% the right. Styles defined in the option list below:
%   restrict, prolong -- thick double lines for the arrows between level nodes
%   cprestrict        -- dashed green double line for the arrow into the checkpoint
%   control           -- dark-red rectangle/arrows for the solve and b^n annotations
%   mglevel           -- blue rounded level node; mglevelhide is its grey variant
%   checkpoint        -- green rectangle for the CP nodes
%   tau               -- small red label attached to nodes by the \foreach loop
%   crelax            -- NOTE(review): defined but not used anywhere in this picture
\begin{tikzpicture}
[scale=0.7,every node/.style={scale=0.7},
>=stealth,
restrict/.style={thick,double},
prolong/.style={thick,double},
cprestrict/.style={green!50!black,thick,double,dashed},
control/.style={rectangle,red!40!black,draw=red!40!black,thick},
mglevel/.style={rounded rectangle,draw=blue!50!black,fill=blue!20,thick,minimum size=6mm},
checkpoint/.style={rectangle,draw=green!50!black,fill=green!20,thick,minimum size=6mm},
mglevelhide/.style={rounded rectangle,draw=gray!50!black,fill=gray!20,thick,minimum size=6mm},
tau/.style={text=red!50!black,draw=red!50!black,fill=red!10,inner sep=1pt},
crelax/.style={text=green!50!black,fill=green!10,inner sep=0pt}
]
\begin{scope}
% Horizontal (\mgdx) and vertical (\mgdy) spacing between neighbouring level nodes.
\newcommand\mgdx{1.9em}
\newcommand\mgdy{2.5em}
% \mgloc{x0}{y0}{k}{s} expands to the coordinate (x0 + s*k*\mgdx, y0 + k*\mgdy):
% k is the row (height) index and the sign s selects the left (-1) or right (1) side.
\newcommand\mgloc[4]{(#1 + #4*\mgdx*#3,#2 + \mgdy*#3)}
% First chain: descends on the left from \mglevelfine (row 4) to the row-0 node,
% followed by a greyed-out node and an empty placeholder on the right.
\node[mglevel] (fine0) at \mgloc{0}{0}{4}{-1} {\mglevelfine};
\node[mglevel] (finem1down0) at \mgloc{0}{0}{3}{-1} {};
\node[mglevel] (cp1down0) at \mgloc{0}{0}{2}{-1} {$\mglevelcp+1$};
\node[mglevel] (cpdown0) at \mgloc{0}{0}{1}{-1} {\mglevelcp};
\node[mglevel] (coarser0) at \mgloc{0}{0}{0}{0} {\ldots};
\node[mglevelhide] (cpup0) at \mgloc{0}{0}{1}{1} {};
\node (cp1up0) at \mgloc{0}{0}{2}{1} {};
% Second chain, offset by 4em: empty placeholder on the left, greyed row-0 node,
% then an ascent on the right back up to \mglevelfine (row 4).
\node (cpdown1) at \mgloc{4em}{0}{1}{-1} {};
\node[mglevelhide] (coarser1) at \mgloc{4em}{0}{0}{1} {\ldots};
\node[mglevel] (cpup1) at \mgloc{4em}{0}{1}{1} {\mglevelcp};
\node[mglevel] (cp1up1) at \mgloc{4em}{0}{2}{1} {$\mglevelcp+1$};
\node[mglevel] (finem1up1) at \mgloc{4em}{0}{3}{1} {};
\node[mglevel] (fine1) at \mgloc{4em}{0}{4}{1} {\mglevelfine};
% Arrows along the first chain (some dashed).
\draw[->,restrict,dashed] (fine0) -- (finem1down0);
\draw[->,restrict] (finem1down0) -- (cp1down0);
\draw[->,restrict] (cp1down0) -- (cpdown0);
\draw[->,restrict,dashed] (cpdown0) -- (coarser0);
\draw[->,prolong,dashed] (coarser0) -- (cpup0);
\draw[->,prolong,dashed] (cpup0) -- (cp1up0);
% Arrows along the second chain.
\draw[->,restrict,dashed] (cpdown1) -- (coarser1);
\draw[->,prolong,dashed] (coarser1) -- (cpup1);
\draw[->,prolong] (cpup1) -- (cp1up1);
\draw[->,prolong] (cp1up1) -- (finem1up1);
\draw[->,prolong,dashed] (finem1up1) -- (fine1);
% Checkpoint node and the dashed "Restrict" arrow into it from the second chain's fine node.
\node[checkpoint] at (4em + \mgdx*4,\mgdy) (cp) {CP};
\draw[>->,cprestrict] (fine1) -- node[below,sloped] {Restrict} (cp);
% Annotations: invisible anchor left of fine0, the solve statement, the
% "next solve" node, and the curved arrows labelled b^n and b^{n+1}.
\node[left=\mgdx of fine0] (bnanchor) {};
\node[control,fill=red!20] at (1.1*\mgdx,3*\mgdy) {Solve $F(u^n;b^n) = 0$};
\node[mglevel,right=of fine1] (finedt) {next solve};
\draw[->, >=stealth, control] (fine1) to[out=20,in=170] node[above] {$b^{n+1}(u^n,b^n)$} (finedt);
\draw[->, >=stealth, control] (bnanchor) to[out=45,in=155] node[above] {$b^n$} (fine0);
% Recovery process: drawn in a scope shifted right by 8*\mgdx.
\begin{scope}[xshift=8*\mgdx]
% Recovery nodes: the CP node at the bottom, "CR" nodes on the row above it,
% and progressively taller excursions up to \mglevelfine.
\node[checkpoint] (rcp) at \mgloc{0}{0}{0}{0} {CP};
\node[mglevel] (r0a) at \mgloc{0}{\mgdy}{0}{0} {CR};
\node[mglevel] (r1a) at \mgloc{0}{\mgdy}{1}{1} {};
\node[mglevel] (r0b) at \mgloc{2*\mgdx}{\mgdy}{0}{0} {CR};
\node[mglevel] (r1b) at \mgloc{2*\mgdx}{\mgdy}{1}{1} {};
\node[mglevel] (r2b) at \mgloc{2*\mgdx}{\mgdy}{2}{1} {\mglevelfine};
\node[mglevel] (r1c) at \mgloc{6*\mgdx}{\mgdy}{1}{-1} {};
\node[mglevel] (r0d) at \mgloc{6*\mgdx}{\mgdy}{0}{0} {CR};
\node[mglevel] (r1d) at \mgloc{6*\mgdx}{\mgdy}{1}{1} {};
\node[mglevel] (r2d) at \mgloc{6*\mgdx}{\mgdy}{2}{1} {\mglevelfine};
% Connections through the recovery sequence, starting from the checkpoint.
\draw[-,prolong,green!50!black] (rcp) -- (r0a);
\draw[->,prolong] (r0a) -- (r1a);
\draw[->,restrict] (r1a) -- (r0b);
\draw[->,restrict] (r0b) -- (r1b);
\draw[->,restrict,dashed] (r1b) -- (r2b);
\draw[->,restrict,dashed] (r2b) -- (r1c);
\draw[->,restrict] (r1c) -- (r0d);
\draw[->,restrict] (r0d) -- (r1d);
\draw[->,restrict,dashed] (r1d) -- (r2d);
% Attach a small \tau label next to the west anchor of each listed node
% (nodes from both the left-hand chains and the recovery sequence).
\foreach \smooth in {finem1down0, cp1down0, cpdown0, coarser0,
cpup1, cp1up1, finem1up1,
r0b,r1c,r0d,r1d} {
\node[above left=-5pt of \smooth.west,tau] {$\tau$};
}
% Frame fitted around the recovery nodes, with its "FMG Recovery" label in the south-east corner.
\node[rectangle,fill=none,draw=green!50!black,thick,fit=(rcp)(r2d)] (recoverbox) {};
\node[rectangle,draw=green!50!black,fill=green!20,thick,minimum size=6mm,above={0cm of recoverbox.south east},anchor=south east] (recover) {FMG Recovery};
\end{scope}
% Bullet-list legend, placed at (\mgdx, 5*\mgdy).
\node (notation) at (\mgdx,5*\mgdy) {
\begin{minipage}{18em}\small\sf
\begin{itemize}\addtolength{\itemsep}{-5pt}
\item checkpoint converged coarse state
\item recover using FMG anchored at $\mglevelcp+1$
\item needs only $\mglevelcp$ neighbor points
\item $\tau$ correction is local
\end{itemize}
\end{minipage}
};
\end{scope}
\end{tikzpicture}
\begin{itemize}
\item Normal multigrid cycles visit all levels moving from $n \to n+1$
\item FMG recovery only accesses levels finer than $\mglevelcp$
\item Only failed processes and neighbors participate in recovery
\item Lightweight checkpointing for transient adjoint computation
\item Postprocessing applications, e.g., in-situ visualization at high temporal resolution in part of the domain
\end{itemize}
\end{frame}
\input{slides/MG/HPGMGFE.tex}
\begin{frame}\large
\begin{itemize}
\item Maximize science per Watt
\item Huge scope remains at problem formulation
\item Raise level of abstraction at which a problem is formally specified
\item Algorithmic optimality is crucial
\item Real problems are messy
\item Performance is always messy at scale
\item Improve matrix-free abstractions, robustness, diagnostics
\item Ideas are easy, implementation and practical issues are hard
\item Better language/library support for aggregating
\end{itemize}
\end{frame}
\end{document}