-
Notifications
You must be signed in to change notification settings - Fork 143
/
book.bak
23377 lines (21457 loc) · 809 KB
/
book.bak
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[7x10]{TimesAPriori_MIT}%%7x10
% TODO:
% move binary subtraction from Lif to Lint
\usepackage[utf8]{inputenc}
%% \usepackage{setspace}
%% \doublespacing
\usepackage{listings}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{lmodern} % better typewriter font for code
%\usepackage{wrapfig}
\usepackage{multirow}
\usepackage{tcolorbox}
\usepackage{color}
%\usepackage{ifthen}
\usepackage{upquote}
\usepackage[all]{xy}
\usepackage{url}
\definecolor{lightgray}{gray}{1}
\newcommand{\black}[1]{{\color{black} #1}}
%\newcommand{\gray}[1]{{\color{lightgray} #1}}
\newcommand{\gray}[1]{{\color{gray} #1}}
\def\racketEd{0}
\def\pythonEd{1}
\def\edition{1}
% material that is specific to the Racket edition of the book
\newcommand{\racket}[1]{{\if\edition\racketEd{#1}\fi}}
% would like a command for: \if\edition\racketEd\color{olive}
% and : \fi\color{black}
% material that is specific to the Python edition of the book
\newcommand{\python}[1]{{\if\edition\pythonEd #1\fi}}
%% For multiple indices:
%\usepackage{multind} moved this to the file TimesAPriori_MIT.cls. -Jeremy
\makeindex{subject}
%\makeindex{authors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\if\edition\racketEd
\lstset{%
language=Lisp,
basicstyle=\ttfamily\small,
morekeywords={lambda,match,goto,if,else,then,struct,Integer,Boolean,Vector,Void,Any,while,begin,define,public,override,class},
deletekeywords={read,mapping,vector},
escapechar=|,
columns=flexible,
%moredelim=[is][\color{red}]{~}{~},
showstringspaces=false
}
\fi
\if\edition\pythonEd
\lstset{%
language=Python,
basicstyle=\ttfamily\small,
morekeywords={match,case,bool,int,let},
deletekeywords={},
escapechar=|,
columns=flexible,
%moredelim=[is][\color{red}]{~}{~},
showstringspaces=false
}
\fi
%%% Any shortcut own defined macros place here
%% sample of author macro:
\input{defs}
\newtheorem{exercise}[theorem]{Exercise}
\numberwithin{theorem}{chapter}
\numberwithin{definition}{chapter}
\numberwithin{equation}{chapter}
% Adjusted settings
\setlength{\columnsep}{4pt}
%% \begingroup
%% \setlength{\intextsep}{0pt}%
%% \setlength{\columnsep}{0pt}%
%% \begin{wrapfigure}{r}{0.5\textwidth}
%% \centering\includegraphics[width=\linewidth]{example-image-a}
%% \caption{Basic layout}
%% \end{wrapfigure}
%% \lipsum[1]
%% \endgroup
\newbox\oiintbox
\setbox\oiintbox=\hbox{$\lower2pt\hbox{\huge$\displaystyle\circ$}
\hskip-13pt\displaystyle\int\hskip-7pt\int_{S}\ $}
\def\oiint{\copy\oiintbox}
\def\boldnabla{\hbox{\boldmath$\displaystyle\nabla$}}
%\usepackage{showframe}
\def\ShowFrameLinethickness{0.125pt}
\addbibresource{book.bib}
\if\edition\pythonEd
\addbibresource{python.bib}
\fi
\begin{document}
\frontmatter
%\HalfTitle{Essentials of Compilation \\ An Incremental Approach in \python{Python}\racket{Racket}}
\HalfTitle{Essentials of Compilation}
\halftitlepage
\clearemptydoublepage
\Title{Essentials of Compilation}
\Booksubtitle{An Incremental Approach in \python{Python}\racket{Racket}}
%\edition{First Edition}
\BookAuthor{Jeremy G. Siek}
\imprint{The MIT Press\\
Cambridge, Massachusetts\\
London, England}
\begin{copyrightpage}
\textcopyright\ 2023 Massachusetts Institute of Technology \\[2ex]
This work is subject to a Creative Commons CC-BY-ND-NC license. \\[2ex]
Subject to such license, all rights are reserved. \\[2ex]
\includegraphics{CCBY-logo}
The MIT Press would like to thank the anonymous peer reviewers who
provided comments on drafts of this book. The generous work of
academic experts is essential for establishing the authority and
quality of our publications. We acknowledge with gratitude the
contributions of these otherwise uncredited readers.
This book was set in Times LT Std Roman by the author. Printed and
bound in the United States of America.
Library of Congress Cataloging-in-Publication Data is available.
ISBN:
10 9 8 7 6 5 4 3 2 1
%% Jeremy G. Siek. Available for free viewing
%% or personal downloading under the
%% \href{https://creativecommons.org/licenses/by-nc-nd/2.0/uk/}{CC-BY-NC-ND}
%% license.
%% Copyright in this monograph has been licensed exclusively to The MIT
%% Press, \url{http://mitpress.mit.edu}, which will be releasing the final
%% version to the public in 2022. All inquiries regarding rights should
%% be addressed to The MIT Press, Rights and Permissions Department.
%% \textcopyright\ [YEAR] Massachusetts Institute of Technology
%% All rights reserved. No part of this book may be reproduced in any
%% form by any electronic or mechanical means (including photocopying,
%% recording, or information storage and retrieval) without permission in
%% writing from the publisher.
%% This book was set in LaTeX by Jeremy G. Siek. Printed and bound in the
%% United States of America.
%% Library of Congress Cataloging-in-Publication Data is available.
%% ISBN:
%% 10\quad9\quad8\quad7\quad6\quad5\quad4\quad3\quad2\quad1
\end{copyrightpage}
\dedication{This book is dedicated to Katie, my partner in everything,
my children, who grew up during the writing of this book, and the
programming language students at Indiana University, whose
thoughtful questions made this a better book.}
%% \begin{epigraphpage}
%% \epigraph{First Epigraph line goes here}{Mention author name if any,
%% \textit{Book Name if any}}
%% \epigraph{Second Epigraph line goes here}{Mention author name if any}
%% \end{epigraphpage}
\tableofcontents
%\listoffigures
%\listoftables
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter*{Preface}
\addcontentsline{toc}{fmbm}{Preface}
There is a magical moment when a programmer presses the run button
and the software begins to execute. Somehow a program written in a
high-level language is running on a computer that is capable only of
shuffling bits. Here we reveal the wizardry that makes that moment
possible. Beginning with the groundbreaking work of Backus and
colleagues in the 1950s, computer scientists developed techniques for
constructing programs called \emph{compilers} that automatically
translate high-level programs into machine code.
We take you on a journey through constructing your own compiler for a
small but powerful language. Along the way we explain the essential
concepts, algorithms, and data structures that underlie compilers. We
develop your understanding of how programs are mapped onto computer
hardware, which is helpful in reasoning about properties at the
junction of hardware and software, such as execution time, software
errors, and security vulnerabilities. For those interested in
pursuing compiler construction as a career, our goal is to provide a
stepping-stone to advanced topics such as just-in-time compilation,
program analysis, and program optimization. For those interested in
designing and implementing programming languages, we connect language
design choices to their impact on the compiler and the generated code.
A compiler is typically organized as a sequence of stages that
progressively translate a program to the code that runs on
hardware. We take this approach to the extreme by partitioning our
compiler into a large number of \emph{nanopasses}, each of which
performs a single task. This enables the testing of each pass in
isolation and focuses our attention, making the compiler far easier to
understand.
The most familiar approach to describing compilers is to dedicate each
chapter to one pass. The problem with that approach is that it
obfuscates how language features motivate design choices in a
compiler. We instead take an \emph{incremental} approach in which we
build a complete compiler in each chapter, starting with a small input
language that includes only arithmetic and variables. We add new
language features in subsequent chapters, extending the compiler as
necessary.
Our choice of language features is designed to elicit fundamental
concepts and algorithms used in compilers.
\begin{itemize}
\item We begin with integer arithmetic and local variables in
Chapters~\ref{ch:trees-recur} and \ref{ch:Lvar}, where we introduce
the fundamental tools of compiler construction: \emph{abstract
syntax trees} and \emph{recursive functions}.
\item In Chapter~\ref{ch:register-allocation-Lvar} we apply
\emph{graph coloring} to assign variables to machine registers.
\item Chapter~\ref{ch:Lif} adds conditional expressions, which
motivates an elegant recursive algorithm for translating them into
conditional \code{goto} statements.
\item Chapter~\ref{ch:Lwhile} adds loops\racket{ and mutable
variables}. This elicits the need for \emph{dataflow
analysis} in the register allocator.
\item Chapter~\ref{ch:Lvec} adds heap-allocated tuples, motivating
\emph{garbage collection}.
\item Chapter~\ref{ch:Lfun} adds functions as first-class values
without lexical scoping, similar to functions in the C programming
language~\citep{Kernighan:1988nx}. The reader learns about the
procedure call stack and \emph{calling conventions} and how they interact
with register allocation and garbage collection. The chapter also
describes how to generate efficient tail calls.
\item Chapter~\ref{ch:Llambda} adds anonymous functions with lexical
scoping, that is, \emph{lambda} expressions. The reader learns about
\emph{closure conversion}, in which lambdas are translated into a
combination of functions and tuples.
% Chapter about classes and objects?
\item Chapter~\ref{ch:Ldyn} adds \emph{dynamic typing}. Prior to this
point the input languages are statically typed. The reader extends
the statically typed language with an \code{Any} type that serves
as a target for compiling the dynamically typed language.
%% {\if\edition\pythonEd
%% \item Chapter~\ref{ch:Lobject} adds support for \emph{objects} and
%% \emph{classes}.
%% \fi}
\item Chapter~\ref{ch:Lgrad} uses the \code{Any} type introduced in
Chapter~\ref{ch:Ldyn} to implement a \emph{gradually typed language}
in which different regions of a program may be static or dynamically
typed. The reader implements runtime support for \emph{proxies} that
allow values to safely move between regions.
\item Chapter~\ref{ch:Lpoly} adds \emph{generics} with autoboxing,
leveraging the \code{Any} type and type casts developed in chapters
\ref{ch:Ldyn} and \ref{ch:Lgrad}.
\end{itemize}
There are many language features that we do not include. Our choices
balance the incidental complexity of a feature versus the fundamental
concepts that it exposes. For example, we include tuples and not
records because although they both elicit the study of heap allocation and
garbage collection, records come with more incidental complexity.
Since 2009, drafts of this book have served as the textbook for
sixteen week compiler courses for upper-level undergraduates and
first-year graduate students at the University of Colorado and Indiana
University.
%
Students come into the course having learned the basics of
programming, data structures and algorithms, and discrete
mathematics.
%
At the beginning of the course, students form groups of two to four
people. The groups complete one chapter every two weeks, starting
with chapter~\ref{ch:Lvar} and finishing with
chapter~\ref{ch:Llambda}. Many chapters include a challenge problem
that we assign to the graduate students. The last two weeks of the
course involve a final project in which students design and implement
a compiler extension of their choosing. The last few chapters can be
used in support of these projects. For compiler courses at
universities on the quarter system (about ten weeks in length), we
recommend completing the course through chapter~\ref{ch:Lvec} or
chapter~\ref{ch:Lfun} and providing some scaffolding code to the
students for each compiler pass.
%
The course can be adapted to emphasize functional languages by
skipping chapter~\ref{ch:Lwhile} (loops) and including
chapter~\ref{ch:Llambda} (lambda). The course can be adapted to
dynamically typed languages by including chapter~\ref{ch:Ldyn}.
%
%% \python{A course that emphasizes object-oriented languages would
%% include Chapter~\ref{ch:Lobject}.}
%
Figure~\ref{fig:chapter-dependences} depicts the dependencies between
chapters. Chapter~\ref{ch:Lfun} (functions) depends on
chapter~\ref{ch:Lvec} (tuples) only in the implementation of efficient
tail calls.
This book has been used in compiler courses at California Polytechnic
State University, Portland State University, Rose–Hulman Institute of
Technology, University of Freiburg, University of Massachusetts
Lowell, and the University of Vermont.
\begin{figure}[tp]
\begin{tcolorbox}[colback=white]
{\if\edition\racketEd
\begin{tikzpicture}[baseline=(current bounding box.center)]
\node (C1) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
\node (C2) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
\node (C3) at (8,1.5) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
\node (C4) at (0,0) {\small Ch.~\ref{ch:Lif} Conditionals};
\node (C5) at (4,0) {\small Ch.~\ref{ch:Lvec} Tuples};
\node (C6) at (8,0) {\small Ch.~\ref{ch:Lfun} Functions};
\node (C9) at (0,-1.5) {\small Ch.~\ref{ch:Lwhile} Loops};
\node (C8) at (4,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
\node (C7) at (8,-1.5) {\small Ch.~\ref{ch:Llambda} Lambda};
\node (C10) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
\node (C11) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
\path[->] (C1) edge [above] node {} (C2);
\path[->] (C2) edge [above] node {} (C3);
\path[->] (C3) edge [above] node {} (C4);
\path[->] (C4) edge [above] node {} (C5);
\path[->,style=dotted] (C5) edge [above] node {} (C6);
\path[->] (C5) edge [above] node {} (C7);
\path[->] (C6) edge [above] node {} (C7);
\path[->] (C4) edge [above] node {} (C8);
\path[->] (C4) edge [above] node {} (C9);
\path[->] (C7) edge [above] node {} (C10);
\path[->] (C8) edge [above] node {} (C10);
\path[->] (C10) edge [above] node {} (C11);
\end{tikzpicture}
\fi}
{\if\edition\pythonEd
\begin{tikzpicture}[baseline=(current bounding box.center)]
\node (C1) at (0,1.5) {\small Ch.~\ref{ch:trees-recur} Preliminaries};
\node (C2) at (4,1.5) {\small Ch.~\ref{ch:Lvar} Variables};
\node (C3) at (8,1.5) {\small Ch.~\ref{ch:register-allocation-Lvar} Registers};
\node (C4) at (0,0) {\small Ch.~\ref{ch:Lif} Conditionals};
\node (C5) at (4,0) {\small Ch.~\ref{ch:Lvec} Tuples};
\node (C6) at (8,0) {\small Ch.~\ref{ch:Lfun} Functions};
\node (C9) at (0,-1.5) {\small Ch.~\ref{ch:Lwhile} Loops};
\node (C8) at (4,-1.5) {\small Ch.~\ref{ch:Ldyn} Dynamic};
% \node (CO) at (0,-3) {\small Ch.~\ref{ch:Lobject} Objects};
\node (C7) at (8,-1.5) {\small Ch.~\ref{ch:Llambda} Lambda};
\node (C10) at (4,-3) {\small Ch.~\ref{ch:Lgrad} Gradual Typing};
\node (C11) at (8,-3) {\small Ch.~\ref{ch:Lpoly} Generics};
\path[->] (C1) edge [above] node {} (C2);
\path[->] (C2) edge [above] node {} (C3);
\path[->] (C3) edge [above] node {} (C4);
\path[->] (C4) edge [above] node {} (C5);
\path[->,style=dotted] (C5) edge [above] node {} (C6);
\path[->] (C5) edge [above] node {} (C7);
\path[->] (C6) edge [above] node {} (C7);
\path[->] (C4) edge [above] node {} (C8);
\path[->] (C4) edge [above] node {} (C9);
\path[->] (C7) edge [above] node {} (C10);
\path[->] (C8) edge [above] node {} (C10);
% \path[->] (C8) edge [above] node {} (CO);
\path[->] (C10) edge [above] node {} (C11);
\end{tikzpicture}
\fi}
\end{tcolorbox}
\caption{Diagram of chapter dependencies.}
\label{fig:chapter-dependences}
\end{figure}
\racket{
We use the \href{https://racket-lang.org/}{Racket} language both for
the implementation of the compiler and for the input language, so the
reader should be proficient with Racket or Scheme. There are many
excellent resources for learning Scheme and
Racket~\citep{Dybvig:1987aa,Abelson:1996uq,Friedman:1996aa,Felleisen:2001aa,Felleisen:2013aa,Flatt:2014aa}.
}
\python{
This edition of the book uses \href{https://www.python.org/}{Python}
both for the implementation of the compiler and for the input language, so the
reader should be proficient with Python. There are many
excellent resources for learning Python~\citep{Lutz:2013vp,Barry:2016vj,Sweigart:2019vn,Matthes:2019vs}.
}
The support code for this book is in the GitHub repository at
the following location:
\begin{center}\small\texttt
https://github.com/IUCompilerCourse/
\end{center}
The compiler targets x86 assembly language~\citep{Intel:2015aa}, so it
is helpful but not necessary for the reader to have taken a computer
systems course~\citep{Bryant:2010aa}. We introduce the parts of x86-64
assembly language that are needed in the compiler.
%
We follow the System V calling
conventions~\citep{Bryant:2005aa,Matz:2013aa}, so the assembly code
that we generate works with the runtime system (written in C) when it
is compiled using the GNU C compiler (\code{gcc}) on Linux and MacOS
operating systems on Intel hardware.
%
On the Windows operating system, \code{gcc} uses the Microsoft x64
calling convention~\citep{Microsoft:2018aa,Microsoft:2020aa}. So the
assembly code that we generate does \emph{not} work with the runtime
system on Windows. One workaround is to use a virtual machine with
Linux as the guest operating system.
\section*{Acknowledgments}
The tradition of compiler construction at Indiana University goes back
to research and courses on programming languages by Daniel Friedman in
the 1970s and 1980s. One of his students, Kent Dybvig, implemented
Chez Scheme~\citep{Dybvig:2006aa}, an efficient, production-quality
compiler for Scheme. Throughout the 1990s and 2000s, Dybvig taught
the compiler course and continued the development of Chez Scheme.
%
The compiler course evolved to incorporate novel pedagogical ideas
while also including elements of real-world compilers. One of
Friedman's ideas was to split the compiler into many small
passes. Another idea, called ``the game,'' was to test the code
generated by each pass using interpreters.
Dybvig, with help from his students Dipanwita Sarkar and Andrew Keep,
developed infrastructure to support this approach and evolved the
course to use even smaller
nanopasses~\citep{Sarkar:2004fk,Keep:2012aa}. Many of the compiler
design decisions in this book are inspired by the assignment
descriptions of \citet{Dybvig:2010aa}. In the mid 2000s, a student of
Dybvig named Abdulaziz Ghuloum observed that the front-to-back
organization of the course made it difficult for students to
understand the rationale for the compiler design. Ghuloum proposed the
incremental approach~\citep{Ghuloum:2006bh} on which this book is
based.
We thank the many students who served as teaching assistants for the
compiler course at IU, including Carl Factora, Ryan Scott, Cameron
Swords, and Chris Wailes. We thank Andre Kuhlenschmidt for work on the
garbage collector and x86 interpreter, Michael Vollmer for work on
efficient tail calls, and Michael Vitousek for help with the first
offering of the incremental compiler course at IU.
We thank professors Bor-Yuh Chang, John Clements, Jay McCarthy, Joseph
Near, Ryan Newton, Nate Nystrom, Peter Thiemann, Andrew Tolmach, and
Michael Wollowski for teaching courses based on drafts of this book
and for their feedback. We thank the National Science Foundation for
the grants that helped to support this work: Grant Numbers 1518844,
1763922, and 1814460.
We thank Ronald Garcia for helping Jeremy survive Dybvig's compiler
course in the early 2000s and especially for finding the bug that
sent our garbage collector on a wild goose chase!
\mbox{}\\
\noindent Jeremy G. Siek \\
Bloomington, Indiana
\mainmatter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Preliminaries}
\label{ch:trees-recur}
\setcounter{footnote}{0}
In this chapter we review the basic tools needed to implement a
compiler. Programs are typically input by a programmer as text, that
is, a sequence of characters. The program-as-text representation is
called \emph{concrete syntax}. We use concrete syntax to concisely
write down and talk about programs. Inside the compiler, we use
\emph{abstract syntax trees} (ASTs) to represent programs in a way
that efficiently supports the operations that the compiler needs to
perform.\index{subject}{concrete syntax}\index{subject}{abstract
syntax}\index{subject}{abstract syntax
tree}\index{subject}{AST}\index{subject}{program}\index{subject}{parse}
The process of translating from concrete syntax to abstract syntax is
called \emph{parsing}~\citep{Aho:2006wb}. This book does not cover the
theory and implementation of parsing.
%
\racket{A parser is provided in the support code for translating from
concrete to abstract syntax.}
%
\python{We use Python's \code{ast} module to translate from concrete
to abstract syntax.}
ASTs can be represented inside the compiler in many different ways,
depending on the programming language used to write the compiler.
%
\racket{We use Racket's
\href{https://docs.racket-lang.org/guide/define-struct.html}{\code{struct}}
feature to represent ASTs (section~\ref{sec:ast}).}
%
\python{We use Python classes and objects to represent ASTs, especially the
classes defined in the standard \code{ast} module for the Python
source language.}
%
We use grammars to define the abstract syntax of programming languages
(section~\ref{sec:grammar}) and pattern matching to inspect individual
nodes in an AST (section~\ref{sec:pattern-matching}). We use
recursive functions to construct and deconstruct ASTs
(section~\ref{sec:recursion}). This chapter provides a brief
introduction to these components.
\racket{\index{subject}{struct}}
\python{\index{subject}{class}\index{subject}{object}}
\section{Abstract Syntax Trees}
\label{sec:ast}
Compilers use abstract syntax trees to represent programs because they
often need to ask questions such as, for a given part of a program,
what kind of language feature is it? What are its subparts? Consider
the program on the left and the diagram of its AST on the
right~\eqref{eq:arith-prog}. This program is an addition operation
that has two subparts, a \racket{read}\python{input} operation and a
negation. The negation has another subpart, the integer constant
\code{8}. By using a tree to represent the program, we can easily
follow the links to go from one part of a program to its subparts.
\begin{center}
\begin{minipage}{0.4\textwidth}
\if\edition\racketEd
\begin{lstlisting}
(+ (read) (- 8))
\end{lstlisting}
\fi
\if\edition\pythonEd
\begin{lstlisting}
input_int() + -8
\end{lstlisting}
\fi
\end{minipage}
\begin{minipage}{0.4\textwidth}
\begin{equation}
\begin{tikzpicture}
\node[draw] (plus) at (0 , 0) {\key{+}};
\node[draw] (read) at (-1, -1.5) {{\if\edition\racketEd\footnotesize\key{read}\fi\if\edition\pythonEd\key{input\_int()}\fi}};
\node[draw] (minus) at (1 , -1.5) {$\key{-}$};
\node[draw] (8) at (1 , -3) {\key{8}};
\draw[->] (plus) to (read);
\draw[->] (plus) to (minus);
\draw[->] (minus) to (8);
\end{tikzpicture}
\label{eq:arith-prog}
\end{equation}
\end{minipage}
\end{center}
We use the standard terminology for trees to describe ASTs: each
rectangle above is called a \emph{node}. The arrows connect a node to its
\emph{children}, which are also nodes. The top-most node is the
\emph{root}. Every node except for the root has a \emph{parent} (the
node of which it is the child). If a node has no children, it is a
\emph{leaf} node; otherwise it is an \emph{internal} node.
\index{subject}{node}
\index{subject}{children}
\index{subject}{root}
\index{subject}{parent}
\index{subject}{leaf}
\index{subject}{internal node}
%% Recall that an \emph{symbolic expression} (S-expression) is either
%% \begin{enumerate}
%% \item an atom, or
%% \item a pair of two S-expressions, written $(e_1 \key{.} e_2)$,
%% where $e_1$ and $e_2$ are each an S-expression.
%% \end{enumerate}
%% An \emph{atom} can be a symbol, such as \code{`hello}, a number, the
%% null value \code{'()}, etc. We can create an S-expression in Racket
%% simply by writing a backquote (called a quasi-quote in Racket)
%% followed by the textual representation of the S-expression. It is
%% quite common to use S-expressions to represent a list, such as $a, b
%% ,c$ in the following way:
%% \begin{lstlisting}
%% `(a . (b . (c . ())))
%% \end{lstlisting}
%% Each element of the list is in the first slot of a pair, and the
%% second slot is either the rest of the list or the null value, to mark
%% the end of the list. Such lists are so common that Racket provides
%% special notation for them that removes the need for the periods
%% and so many parenthesis:
%% \begin{lstlisting}
%% `(a b c)
%% \end{lstlisting}
%% The following expression creates an S-expression that represents AST
%% \eqref{eq:arith-prog}.
%% \begin{lstlisting}
%% `(+ (read) (- 8))
%% \end{lstlisting}
%% When using S-expressions to represent ASTs, the convention is to
%% represent each AST node as a list and to put the operation symbol at
%% the front of the list. The rest of the list contains the children. So
%% in the above case, the root AST node has operation \code{`+} and its
%% two children are \code{`(read)} and \code{`(- 8)}, just as in the
%% diagram \eqref{eq:arith-prog}.
%% To build larger S-expressions one often needs to splice together
%% several smaller S-expressions. Racket provides the comma operator to
%% splice an S-expression into a larger one. For example, instead of
%% creating the S-expression for AST \eqref{eq:arith-prog} all at once,
%% we could have first created an S-expression for AST
%% \eqref{eq:arith-neg8} and then spliced that into the addition
%% S-expression.
%% \begin{lstlisting}
%% (define ast1.4 `(- 8))
%% (define ast1_1 `(+ (read) ,ast1.4))
%% \end{lstlisting}
%% In general, the Racket expression that follows the comma (splice)
%% can be any expression that produces an S-expression.
{\if\edition\racketEd
We define a Racket \code{struct} for each kind of node. For this
chapter we require just two kinds of nodes: one for integer constants
and one for primitive operations. The following is the \code{struct}
definition for integer constants.\footnote{All the AST structures are
defined in the file \code{utilities.rkt} in the support code.}
\begin{lstlisting}
(struct Int (value))
\end{lstlisting}
An integer node contains just one thing: the integer value.
We establish the convention that \code{struct} names, such
as \code{Int}, are capitalized.
To create an AST node for the integer $8$, we write \INT{8}.
\begin{lstlisting}
(define eight (Int 8))
\end{lstlisting}
We say that the value created by \INT{8} is an
\emph{instance} of the
\code{Int} structure.
The following is the \code{struct} definition for primitive operations.
\begin{lstlisting}
(struct Prim (op args))
\end{lstlisting}
A primitive operation node includes an operator symbol \code{op} and a
list of child arguments called \code{args}. For example, to create an
AST that negates the number $8$, we write the following.
\begin{lstlisting}
(define neg-eight (Prim '- (list eight)))
\end{lstlisting}
Primitive operations may have zero or more children. The \code{read}
operator has zero:
\begin{lstlisting}
(define rd (Prim 'read '()))
\end{lstlisting}
The addition operator has two children:
\begin{lstlisting}
(define ast1_1 (Prim '+ (list rd neg-eight)))
\end{lstlisting}
We have made a design choice regarding the \code{Prim} structure.
Instead of using one structure for many different operations
(\code{read}, \code{+}, and \code{-}), we could have instead defined a
structure for each operation, as follows:
\begin{lstlisting}
(struct Read ())
(struct Add (left right))
(struct Neg (value))
\end{lstlisting}
The reason that we choose to use just one structure is that many parts
of the compiler can use the same code for the different primitive
operators, so we might as well just write that code once by using a
single structure.
%
\fi}
{\if\edition\pythonEd
We use a Python \code{class} for each kind of node.
The following is the class definition for
constants from the Python \code{ast} module.
\begin{lstlisting}
class Constant:
def __init__(self, value):
self.value = value
\end{lstlisting}
An integer constant node includes just one thing: the integer value.
To create an AST node for the integer $8$, we write \INT{8}.
\begin{lstlisting}
eight = Constant(8)
\end{lstlisting}
We say that the value created by \INT{8} is an
\emph{instance} of the \code{Constant} class.
The following is the class definition for unary operators.
\begin{lstlisting}
class UnaryOp:
def __init__(self, op, operand):
self.op = op
self.operand = operand
\end{lstlisting}
The specific operation is specified by the \code{op} parameter. For
example, the class \code{USub} is for unary subtraction.
(More unary operators are introduced in later chapters.) To create an AST that
negates the number $8$, we write the following.
\begin{lstlisting}
neg_eight = UnaryOp(USub(), eight)
\end{lstlisting}
The call to the \code{input\_int} function is represented by the
\code{Call} and \code{Name} classes.
\begin{lstlisting}
class Call:
def __init__(self, func, args):
self.func = func
self.args = args
class Name:
def __init__(self, id):
self.id = id
\end{lstlisting}
To create an AST node that calls \code{input\_int}, we write
\begin{lstlisting}
read = Call(Name('input_int'), [])
\end{lstlisting}
Finally, to represent the addition in \eqref{eq:arith-prog}, we use
the \code{BinOp} class for binary operators.
\begin{lstlisting}
class BinOp:
def __init__(self, left, op, right):
self.op = op
self.left = left
self.right = right
\end{lstlisting}
Similar to \code{UnaryOp}, the specific operation is specified by the
\code{op} parameter, which for now is just an instance of the
\code{Add} class. So to create the AST
node that adds negative eight to some user input, we write the following.
\begin{lstlisting}
ast1_1 = BinOp(read, Add(), neg_eight)
\end{lstlisting}
\fi}
To compile a program such as \eqref{eq:arith-prog}, we need to know
that the operation associated with the root node is addition and we
need to be able to access its two
children. \racket{Racket}\python{Python} provides pattern matching to
support these kinds of queries, as we see in
section~\ref{sec:pattern-matching}.
We often write down the concrete syntax of a program even when we
actually have in mind the AST, because the concrete syntax is more
concise. We recommend that you always think of programs as abstract
syntax trees.
\section{Grammars}
\label{sec:grammar}
\index{subject}{integer}
\index{subject}{literal}
%\index{subject}{constant}
A programming language can be thought of as a \emph{set} of programs.
The set is infinite (that is, one can always create larger programs),
so one cannot simply describe a language by listing all the
programs in the language. Instead we write down a set of rules, a
\emph{grammar}, for building programs. Grammars are often used to
define the concrete syntax of a language, but they can also be used to
describe the abstract syntax. We write our rules in a variant of
Backus-Naur form (BNF)~\citep{Backus:1960aa,Knuth:1964aa}.
\index{subject}{Backus-Naur form}\index{subject}{BNF} As an example,
we describe a small language, named \LangInt{}, that consists of
integers and arithmetic operations. \index{subject}{grammar}
The first grammar rule for the abstract syntax of \LangInt{} says that an
instance of the \racket{\code{Int} structure}\python{\code{Constant} class} is an expression:
\begin{equation}
\Exp ::= \INT{\Int} \label{eq:arith-int}
\end{equation}
%
Each rule has a left-hand side and a right-hand side.
If you have an AST node that matches the
right-hand side, then you can categorize it according to the
left-hand side.
%
Symbols in typewriter font, such as \racket{\code{Int}}\python{\code{Constant}},
are \emph{terminal} symbols and must literally appear in the program for the
rule to be applicable.\index{subject}{terminal}
%
Our grammars do not mention \emph{white space}, that is, delimiter
characters like spaces, tabs, and new lines. White space may be
inserted between symbols for disambiguation and to improve
readability. \index{subject}{white space}
%
A name such as $\Exp$ that is defined by the grammar rules is a
\emph{nonterminal}. \index{subject}{nonterminal}
%
The name $\Int$ is also a nonterminal, but instead of defining it with
a grammar rule, we define it with the following explanation. An
$\Int$ is a sequence of decimals ($0$ to $9$), possibly starting with
$-$ (for negative integers), such that the sequence of decimals
represents an integer in the range $-2^{62}$ to $2^{62}-1$. This
enables the representation of integers using 63 bits, which simplifies
several aspects of compilation.
%
\racket{Thus, these integers correspond to the Racket \texttt{fixnum}
datatype on a 64-bit machine.}
%
\python{In contrast, integers in Python have unlimited precision, but
the techniques needed to handle unlimited precision fall outside the
scope of this book.}
The second grammar rule is the \READOP{} operation, which receives an
input integer from the user of the program.
\begin{equation}
\Exp ::= \READ{} \label{eq:arith-read}
\end{equation}
The third rule categorizes the negation of an $\Exp$ node as an
$\Exp$.
\begin{equation}
\Exp ::= \NEG{\Exp} \label{eq:arith-neg}
\end{equation}
We can apply these rules to categorize the ASTs that are in the
\LangInt{} language. For example, by rule \eqref{eq:arith-int},
\INT{8} is an $\Exp$, and then by rule \eqref{eq:arith-neg} the
following AST is an $\Exp$.
\begin{center}
\begin{minipage}{0.5\textwidth}
\NEG{\INT{\code{8}}}
\end{minipage}
\begin{minipage}{0.25\textwidth}
\begin{equation}
\begin{tikzpicture}
\node[draw, circle] (minus) at (0, 0) {$\text{--}$};
\node[draw, circle] (8) at (0, -1.2) {$8$};
\draw[->] (minus) to (8);
\end{tikzpicture}
\label{eq:arith-neg8}
\end{equation}
\end{minipage}
\end{center}
The next two grammar rules are for addition and subtraction expressions:
\begin{align}
\Exp &::= \ADD{\Exp}{\Exp} \label{eq:arith-add}\\
\Exp &::= \SUB{\Exp}{\Exp} \label{eq:arith-sub}
\end{align}
We can now justify that the AST \eqref{eq:arith-prog} is an $\Exp$ in
\LangInt{}. We know that \READ{} is an $\Exp$ by rule
\eqref{eq:arith-read}, and we have already categorized
\NEG{\INT{\code{8}}} as an $\Exp$, so we apply rule \eqref{eq:arith-add}
to show that
\[
\ADD{\READ{}}{\NEG{\INT{\code{8}}}}
\]
is an $\Exp$ in the \LangInt{} language.
If you have an AST for which these rules do not apply, then the
AST is not in \LangInt{}. For example, the program \racket{\code{(*
(read) 8)}} \python{\code{input\_int() * 8}} is not in \LangInt{}
because there is no rule for the \key{*} operator. Whenever we
define a language with a grammar, the language includes only those
programs that are justified by the grammar rules.
{\if\edition\pythonEd
The language \LangInt{} includes a second nonterminal $\Stmt$ for statements.
There is a statement for printing the value of an expression
\[
\Stmt{} ::= \PRINT{\Exp}
\]
and a statement that evaluates an expression but ignores the result.
\[
\Stmt{} ::= \EXPR{\Exp}
\]
\fi}
{\if\edition\racketEd
The last grammar rule for \LangInt{} states that there is a
\code{Program} node to mark the top of the whole program:
\[
\LangInt{} ::= \PROGRAM{\code{'()}}{\Exp}
\]
The \code{Program} structure is defined as follows:
\begin{lstlisting}
(struct Program (info body))
\end{lstlisting}
where \code{body} is an expression. In further chapters, the \code{info}
part is used to store auxiliary information, but for now it is
just the empty list.
\fi}
{\if\edition\pythonEd
The last grammar rule for \LangInt{} states that there is a
\code{Module} node to mark the top of the whole program:
\[
\LangInt{} ::= \PROGRAM{}{\Stmt^{*}}
\]
The asterisk symbol $*$ indicates a list of the preceding grammar item, in
this case, a list of statements.
%
The \code{Module} class is defined as follows
\begin{lstlisting}
class Module:
def __init__(self, body):
self.body = body
\end{lstlisting}
where \code{body} is a list of statements.
\fi}
It is common to have many grammar rules with the same left-hand side
but different right-hand sides, such as the rules for $\Exp$ in the
grammar of \LangInt{}. As shorthand, a vertical bar can be used to
combine several right-hand sides into a single rule.
The concrete syntax for \LangInt{} is shown in
figure~\ref{fig:r0-concrete-syntax} and the abstract syntax for
\LangInt{} is shown in figure~\ref{fig:r0-syntax}.
\racket{The \code{read-program} function provided in
\code{utilities.rkt} of the support code reads a program from a file
(the sequence of characters in the concrete syntax of Racket) and
parses it into an abstract syntax tree. Refer to the description of
\code{read-program} in appendix~\ref{appendix:utilities} for more
details.}
\python{The \code{parse} function in Python's \code{ast} module
converts the concrete syntax (represented as a string) into an
abstract syntax tree.}
\newcommand{\LintGrammarRacket}{
\begin{array}{rcl}
\Type &::=& \key{Integer} \\
\Exp{} &::=& \Int{} \MID \CREAD \MID \CNEG{\Exp} \MID \CADD{\Exp}{\Exp}
\MID \CSUB{\Exp}{\Exp}
\end{array}
}
\newcommand{\LintASTRacket}{
\begin{array}{rcl}
\Type &::=& \key{Integer} \\
\Exp{} &::=& \INT{\Int} \MID \READ{} \\
&\MID& \NEG{\Exp} \MID \ADD{\Exp}{\Exp} \MID \SUB{\Exp}{\Exp}
\end{array}
}
\newcommand{\LintGrammarPython}{
\begin{array}{rcl}
\Exp &::=& \Int \MID \key{input\_int}\LP\RP \MID \key{-}\;\Exp \MID \Exp \; \key{+} \; \Exp \MID \Exp \; \key{-} \; \Exp \MID \LP\Exp\RP \\
\Stmt &::=& \key{print}\LP \Exp \RP \MID \Exp
\end{array}
}
\newcommand{\LintASTPython}{
\begin{array}{rcl}
\itm{binaryop} &::= & \code{Add()} \MID \code{Sub()} \\
\itm{unaryop} &::= & \code{USub()} \\
\Exp{} &::=& \INT{\Int} \MID \READ{} \\
&\MID& \UNIOP{\itm{unaryop}}{\Exp} \MID \BINOP{\itm{binaryop}}{\Exp}{\Exp} \\
\Stmt{} &::=& \PRINT{\Exp} \MID \EXPR{\Exp}
\end{array}
}
\begin{figure}[tp]
\begin{tcolorbox}[colback=white]
{\if\edition\racketEd
\[
\begin{array}{l}
\LintGrammarRacket \\
\begin{array}{rcl}
\LangInt{} &::=& \Exp
\end{array}
\end{array}
\]
\fi}
{\if\edition\pythonEd
\[
\begin{array}{l}
\LintGrammarPython \\
\begin{array}{rcl}
\LangInt{} &::=& \Stmt^{*}
\end{array}
\end{array}
\]
\fi}
\end{tcolorbox}
\caption{The concrete syntax of \LangInt{}.}