-
Notifications
You must be signed in to change notification settings - Fork 7
/
20140408-FastIRK.tex
409 lines (369 loc) · 13.5 KB
/
20140408-FastIRK.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
% \documentclass[handout]{beamer}
\documentclass{beamer}
\mode<presentation>
{
\usetheme{ANLBlue}
% \usefonttheme[onlymath]{serif}
% \usetheme{Singapore}
% \usetheme{Warsaw}
% \usetheme{Malmoe}
% \useinnertheme{circles}
% \useoutertheme{infolines}
% \useinnertheme{rounded}
\setbeamercovered{transparent=20}
}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{alltt,listings,multirow,ulem,siunitx}
\usepackage[absolute,overlay]{textpos}
\TPGrid{1}{1}
\usepackage{pdfpages}
\usepackage{ulem}
\usepackage{multimedia}
\usepackage{multicol}
\newcommand\hmmax{0}
\newcommand\bmmax{0}
\usepackage{bm}
\usepackage{comment}
\usepackage{subcaption}
% font definitions, try \usepackage{ae} instead of the following
% three lines if you don't like this look
\usepackage{mathptmx}
\usepackage[scaled=.90]{helvet}
% \usepackage{courier}
\usepackage[T1]{fontenc}
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{shadows,arrows,shapes.misc,shapes.arrows,shapes.multipart,arrows,decorations.pathmorphing,backgrounds,positioning,fit,petri,calc,shadows,chains,matrix}
\newcommand\vvec{\bm v}
\newcommand\bvec{\bm b}
\newcommand\bxk{\bvec_0 \times \kappa_0 \cdot \nabla}
\newcommand\delp{\nabla_\perp}
% \usepackage{pgfpages}
% \pgfpagesuselayout{4 on 1}[a4paper,landscape,border shrink=5mm]
\usepackage{JedMacros}
\newcommand{\timeR}{t_{\mathrm{R}}}
\newcommand{\timeW}{t_{\mathrm{W}}}
\newcommand{\mglevel}{\ensuremath{\ell}}
\newcommand{\mglevelcp}{\ensuremath{\mglevel_{\mathrm{cp}}}}
\newcommand{\mglevelcoarse}{\ensuremath{\mglevel_{\mathrm{coarse}}}}
\newcommand{\mglevelfine}{\ensuremath{\mglevel_{\mathrm{fine}}}}
%solution and residual
\newcommand{\vx}{\ensuremath{x}}
\newcommand{\vc}{\ensuremath{\hat{x}}}
\newcommand{\vr}{\ensuremath{r}}
\newcommand{\vb}{\ensuremath{b}}
%operators
\newcommand{\vA}{\ensuremath{A}}
\newcommand{\vP}{\ensuremath{I_H^h}}
\newcommand{\vS}{\ensuremath{S}}
\newcommand{\vR}{\ensuremath{I_h^H}}
\newcommand{\vI}{\ensuremath{\hat I_h^H}}
\newcommand{\vV}{\ensuremath{\mathbf{V}}}
\newcommand{\vF}{\ensuremath{F}}
\newcommand{\vtau}{\ensuremath{\mathbf{\tau}}}
\title{Fast solvers for implicit Runge-Kutta}
\author{{\bf Jed Brown} \texttt{jedbrown@mcs.anl.gov} (ANL and CU Boulder) \\
Debojyoti Ghosh (ANL)}
% - Use the \inst command only if there are several affiliations.
% - Keep it simple, no one is interested in your street address.
% \institute
% {
% Mathematics and Computer Science Division \\ Argonne National Laboratory
% }
\date{Copper Mountain, 2014-04-08 \\
This talk: \url{http://59A2.org/files/20140408-FastIRK.pdf}}
% This is only inserted into the PDF information catalog. Can be left
% out.
\subject{Talks}
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
% \pgfdeclareimage[height=0.5cm]{university-logo}{university-logo-filename}
% \logo{\pgfuseimage{university-logo}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
% \AtBeginSubsection[]
% {
% \begin{frame}<beamer>
% \frametitle{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
% }
\AtBeginSection[]
{
\begin{frame}<beamer>
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
% \beamerdefaultoverlayspecification{<+->}
\begin{document}
\lstset{language=C}
\normalem
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Motivation}
\begin{itemize}
\item Hardware trends
\begin{itemize}
\item Memory bandwidth a precious commodity (8+ flops/byte)
\item Vectorization necessary for floating point performance
\item Conflicting demands of cache reuse and vectorization
\item Can deliver bandwidth, but latency is hard
\end{itemize}
\item Assembled sparse linear algebra is doomed!
\begin{itemize}
\item Limited by memory bandwidth (1 flop/6 bytes)
\item No vectorization without blocking
\end{itemize}
\item Spatial-domain vectorization is \emph{intrusive}
\begin{itemize}
\item Must be unassembled to avoid bandwidth bottleneck
\item Whether it is ``hard'' depends on discretization
\item Geometry, boundary conditions, and adaptivity
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Sparse linear algebra is dead (long live sparse \ldots)}
\begin{itemize}
\item Arithmetic intensity $< 1/4$
\item Idea: multiple right hand sides
\begin{equation*}
\frac{(2 k \text{ flops})(\text{bandwidth})}{\texttt{sizeof(Scalar)} + \texttt{sizeof(Int)}}, \quad k \ll \text{avg. nz/row}
\end{equation*}
\item Problem: popular algorithms have nested data dependencies
\begin{itemize}
\item Time step \\
\qquad Nonlinear solve \\
\qquad \qquad Krylov solve \\
\qquad \qquad \qquad Preconditioner/sparse matrix
\end{itemize}
\item Cannot parallelize/vectorize these nested loops
\item<2> \alert{Can we create new algorithms to reorder/fuse loops?}
\begin{itemize}
\item Reduce latency-sensitivity for communication
\item Reduce memory bandwidth (reuse matrix while in cache)
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Attempt: $s$-step methods in 3D}
\includegraphics[width=1.1\textwidth]{figures/SStepEfficiency.pdf}
\begin{itemize}
\item Limited choice of preconditioners (none optimal, surface/volume)
\item Amortizing message latency is most important for strong-scaling
\item $s$-step methods have high overhead for small subdomains
\end{itemize}
\end{frame}
\begin{frame}{Attempt: distribute in time (multilevel SDC/Parareal)}
\includegraphics[width=0.9\textwidth]{figures/EmmettMinionPFASSTCost.png}
\begin{itemize}
\item PFASST algorithm (Emmett and Minion, 2012)
\item Zero-latency messages (cf. performance model of $s$-step)
\item Spectral Deferred Correction: iterative, converges to IRK (Gauss, Radau, \ldots)
\item Stiff problems use implicit basic integrator (synchronizing on spatial communicator)
\end{itemize}
\end{frame}
\begin{frame}{Problems with SDC and time-parallel}
\includegraphics[width=\textwidth]{figures/TS/SDCScalingEmmett.png} \\
c/o Matthew Emmett, parallel compared to sequential SDC
\begin{itemize}
\item Iteration count not uniform in $s$; efficiency starts low
\item Low arithmetic intensity; tight error tolerance (cf. Crank-Nicolson)
\item Parabolic space-time (Greenwald and Brandt; Horton and Vandewalle)
\end{itemize}
\end{frame}
\begin{frame}{Runge-Kutta methods}
\begin{gather*}
\dot u = F(u) \\
\underbrace{
\begin{pmatrix}
y_1 \\
\vdots \\
y_s
\end{pmatrix}}_Y =
u^{n} + h
\underbrace{
\begin{bmatrix}
a_{11} & \dotsb & a_{1s} \\
\vdots & \ddots & \vdots \\
a_{s1} & \dotsb & a_{ss}
\end{bmatrix}}_A
F
\begin{pmatrix}
y_1 \\
\vdots \\
y_s
\end{pmatrix} \\
u^{n+1} = u^n + b^T Y
\end{gather*}
\begin{itemize}
\item General framework for one-step methods
\item Diagonally implicit: $A$ lower triangular, stage order 1 (or 2 with explicit first stage)
\item Singly diagonally implicit: all $A_{ii}$ equal, reuse solver setup, stage order 1
\item If $A$ is a general full matrix, all stages are coupled, ``implicit RK''
\end{itemize}
\end{frame}
\begin{frame}{Implicit Runge-Kutta}
\begin{center}
\begin{tabular}{>{$}c<{$} | >{$}c<{$} >{$}c<{$} >{$}c<{$}}
\frac 1 2 - \frac{\sqrt{15}}{10} & \frac{5}{36} & \frac 2 9 - \frac{\sqrt{15}}{15} & \frac{5}{36} - \frac{\sqrt{15}}{30} \\
\frac 1 2 & \frac{5}{36} + \frac{\sqrt{15}}{24} & \frac 2 9 & \frac{5}{36} - \frac{\sqrt{15}}{24} \\
\frac 1 2 - \frac{\sqrt{15}}{10} & \frac{5}{36} + \frac{\sqrt{15}}{30} & \frac 2 9 + \frac{\sqrt{15}}{15} & \frac{5}{36} \\[4pt]
\hline
\vspace{4pt}
& \frac{5}{18} & \frac 4 9 & \frac{5}{18}
\end{tabular}
\end{center}
\begin{itemize}
\item Excellent accuracy and stability properties
\item Gauss methods with $s$ stages
\begin{itemize}
\item order $2s$, $(s,s)$ Pad\'e approximation to the exponential
\item $A$-stable, symplectic
\end{itemize}
\item Radau (IIA) methods with $s$ stages
\begin{itemize}
\item order $2s-1$, $A$-stable, $L$-stable
\end{itemize}
\item Lobatto (IIIC) methods with $s$ stages
\begin{itemize}
\item order $2s-2$, $A$-stable, $L$-stable, self-adjoint
\end{itemize}
\item Stage order $s$ or $s+1$
\end{itemize}
\end{frame}
\begin{frame}{Method of Butcher (1976) and Bickart (1977)}
\begin{itemize}
\item Newton linearize Runge-Kutta system at $u^*$
\begin{align*}
Y &= u^{n} + h A F(Y) & \big[ I_s \otimes I_n + h A \otimes J(u^*)\big ] \delta Y &= RHS
\end{align*}
\item Solve linear system with tensor product operator
\begin{equation*}
\hat G = S \otimes I_n + I_s \otimes J
\end{equation*}
where $S = (hA)^{-1}$ is $s\times s$ dense, $J = -\partial F(u)/\partial u$ sparse
\item SDC (2000) is Gauss-Seidel with low-order corrector
\item Butcher/Bickart method: diagonalize $S = X \Lambda X^{-1}$
\begin{itemize}
\item $\Lambda \otimes I_n + I_s \otimes J$
\item $s$ decoupled solves
\item Complex eigenvalues (overhead for real problem)
\end{itemize}
\item Problem: $X$ is exponentially ill-conditioned wrt. $s$
\item We avoid diagonalization
\begin{itemize}
\item Permute $\hat G$ to reuse $J$: $G = I_n \otimes S + J \otimes I_s$
\item Stages coupled via register transpose at spatial-point granularity
\item Same convergence properties as Butcher/Bickart
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{MatTAIJ: ``sparse'' tensor product matrices}
\begin{gather*}
G = I_n \otimes S + J \otimes T
\end{gather*}
\begin{itemize}
\item $J$ is parallel and sparse, $S$ and $T$ are small and dense
\item More general than multiple RHS (multivectors)
\item Compare $J \otimes I_s$ to multiple right hand sides in row-major
\item Runge-Kutta systems have $T = I_s$ (permuted from Butcher method)
\item Stream $J$ through cache once, same efficiency as multiple RHS
\item Unintrusive compared to spatial-domain vectorization or $s$-step
\end{itemize}
\end{frame}
\begin{frame}{Convergence with point-block Jacobi preconditioning}
\begin{itemize}
\item 3D centered-difference diffusion problem
\end{itemize}
\begin{tabular}{lrrrrr}
\toprule
Method & order & nsteps & Krylov its. & (Average) \\
\midrule
Gauss 1 & 2 & 16 & 130 & (8.1) \\
Gauss 2 & 4 & 8 & 122 & (15.2) \\
Gauss 4 & 8 & 4 & 100 & (25) \\
Gauss 8 & 16 & 2 & 78 & (39) \\
\bottomrule
\end{tabular}
\end{frame}
\begin{frame}{We really want multigrid}
\begin{itemize}
\item Prolongation: $P \otimes I_s$
\item Coarse operator: $I_n \otimes S + (R J P) \otimes I_s$
\item Larger time steps
\item GMRES(2)/point-block Jacobi smoothing
\item FGMRES outer
\end{itemize}
\begin{tabular}{lrrrrr}
\toprule
Method & order & nsteps & Krylov its. & (Average) \\
\midrule
Gauss 1 & 2 & 16 & 82 & (5.1) \\
Gauss 2 & 4 & 8 & 64 & (8) \\
Gauss 4 & 8 & 4 & 44 & (11) \\
Gauss 8 & 16 & 2 & 42 & (21) \\
\bottomrule
\end{tabular}
\end{frame}
\begin{frame}{Toward a better AMG for IRK/tensor-product systems}
\begin{columns}
\begin{column}{0.3\textwidth}
\includegraphics[width=\textwidth]{figures/TS/Gauss8-Eig.png}
\end{column}
\begin{column}{0.7\textwidth}
\begin{itemize}
\item Start with $\hat R = R \otimes I_s$, $\hat P = P \otimes I_s$
\begin{gather*}
G_{\text{coarse}} = \hat R (I_n \otimes S + J \otimes I_s) \hat P
\end{gather*}
\item Imaginary component slows convergence
\item Idea: rotate eigenvalues on coarse levels \\
Erlangga and Nabben \emph{On a multilevel Krylov method for the Helmholtz equation preconditioned by shifted Laplacian}
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Implicit Runge-Kutta for advection}
\begin{table}
\centering
\caption{Total number of iterations (communications or accesses of $J$) to solve linear advection to $t=1$ on a $1024$-point grid using point-block Jacobi preconditioning of implicit Runge-Kutta matrix.
The relative algebraic solver tolerance is $10^{-8}$.}\label{tab:irk-advection}
\begin{tabular}{lrrr}
\toprule
Family & Stages & Order & Iterations \\
\midrule
Crank-Nicolson/Gauss & 1 & 2 & 3627 \\
Gauss & 2 & 4 & 2560 \\
Gauss & 4 & 8 & 1735 \\
Gauss & 8 & 16 & 1442 \\
\bottomrule
\end{tabular}
\end{table}
\begin{itemize}
\item Naive centered-difference discretization
\item Leapfrog requires 1024 iterations at CFL=1
\item This is $A$-stable (can handle dissipation)
\end{itemize}
\end{frame}
\begin{frame}{Outlook on IRK}
\begin{itemize}
\item IRK \emph{unintrusively} offers bandwidth reuse and vectorization
\item No need for complex arithmetic (cf. Butcher and Bickart)
\item Need polynomial smoothers for IRK spectra
\item Change number of stages on spatially-coarse grids ($p$-MG, or even increase)?
\item Experiment with SOR-type smoothers
\begin{itemize}
\item Prefer point-block Jacobi in smoothers for parallelism
\end{itemize}
\item Study efficiency for nonlinear problems
\item Is it possible to speed up advection?
\item Possible IRK correction for IMEX (non-smooth explicit function)
\item PETSc implementation (parallel example running, interface in-progress)
\end{itemize}
\end{frame}
\end{document}