-
Notifications
You must be signed in to change notification settings - Fork 36
/
tidyRcheatsheet.tex
510 lines (395 loc) · 15.4 KB
/
tidyRcheatsheet.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
\documentclass[10pt,landscape]{article}
\usepackage{multicol}
\usepackage{calc}
\usepackage{ifthen}
\usepackage{amsmath}
\usepackage{lipsum}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{threeparttable}
\usepackage[landscape]{geometry}
\definecolor{oucrimson}{RGB}{132,22,23} % OU crimson
% \definecolor{navy}{RGB}{132,22,23} % OU crimson
\usepackage{hyperref}
% hyperref configuration: PDF bookmarks, line-breakable links, and
% borderless links coloured with the oucrimson colour defined above.
\hypersetup{%
  % --- PDF strings and bookmarks ---
  unicode=true,
  bookmarksnumbered=true,
  bookmarksopen=true,
  bookmarksopenlevel=3,
  % --- link layout ---
  breaklinks=true,
  pdfborder={0 0 0},
  % --- link colours (coloured text instead of boxes) ---
  colorlinks=true,
  citecolor=oucrimson,
  filecolor=oucrimson,
  linkcolor=oucrimson,
  urlcolor=oucrimson
}
% \makeatletter
% \renewcommand\verbatim@font{\itshape}
% \makeatother
% \makeatletter
% \renewcommand\verbatim@font{\itshape}
% \makeatother
% To make this come out properly in landscape mode, do one of the following
% 1.
% pdflatex tidyRcheatsheet.tex
%
% 2.
% latex tidyRcheatsheet.tex
% dvips -P pdf -t landscape tidyRcheatsheet.dvi
% ps2pdf tidyRcheatsheet.ps
% If you're reading this, be prepared for confusion. Making this was
% a learning experience for me, and it shows. Much of the placement
% was hacked in; if you make it better, let me know...
% Page margins are chosen from the paper width:
%   \paperwidth = 11in (letter paper)        -> .5in on every side
%   any other width, including 297mm for A4  -> 1cm on every side
% (This probably isn't strictly necessary.)
\ifthenelse{\lengthtest{\paperwidth = 11in}}%
  {\geometry{margin=.5in}}%
  {\geometry{margin=1cm}}
% Turn off header and footer
\pagestyle{empty}
% Redefine section commands to use less space.
% Each heading is built with \@startsection, whose arguments are
%   {name}{level}{indent}{beforeskip}{afterskip}{style}.
% The beforeskip is negative: its absolute value is the space left above
% the heading, and the sign suppresses the indentation of the paragraph
% that follows the heading.
\makeatletter
% \section: large bold heading
\renewcommand{\section}{\@startsection{section}{1}{0mm}%
  {-1ex plus -.5ex minus -.2ex}%
  {0.5ex plus .2ex}%
  {\normalfont\large\bfseries}}
% \subsection: normal-size bold heading
\renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}%
  {-1ex plus -.5ex minus -.2ex}%
  {0.5ex plus .2ex}%
  {\normalfont\normalsize\bfseries}}
% \subsubsection: small bold heading, with a larger skip after it
\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}%
  {-1ex plus -.5ex minus -.2ex}%
  {1ex plus .2ex}%
  {\normalfont\small\bfseries}}
\makeatother
% Define the \BibTeX logo command (it is not used in the body of this sheet).
% \DeclareRobustCommand overrides any earlier definition, as the \def it
% replaces did, and makes the logo safe in moving arguments.
% \normalfont\rmfamily and \normalfont\scshape are the NFSS equivalents of
% the obsolete \rm and \sc switches.
\DeclareRobustCommand{\BibTeX}{{\normalfont\rmfamily B\kern-.05em{\normalfont\scshape i\kern-.025em b}\kern-.08em
T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
% Don't print section numbers
\setcounter{secnumdepth}{0}
% No first-line indentation for paragraphs
\setlength{\parindent}{0pt}
% No fixed space between paragraphs, only 0.5ex of stretch
\setlength{\parskip}{0pt plus 0.5ex}
% -----------------------------------------------------------------------
\begin{document}
\raggedright
\footnotesize
\begin{multicols}{3}
% multicol parameters
% These lengths are set only within the three main columns
%\setlength{\columnseprule}{0.25pt}
\setlength{\premulticols}{1pt}
\setlength{\postmulticols}{1pt}
\setlength{\multicolsep}{1pt}
\setlength{\columnsep}{2pt}
\begin{center}
{\Large{\textbf{Econometrics in R's \texttt{tidyverse}}}} \\
{\footnotesize by Tyler Ransom, University of Oklahoma}
{\footnotesize \href{https://www.twitter.com/tyleransom}{@tyleransom}}
\end{center}
\section{Basics of R}
R can be thought of as a really fancy calculator
\smallskip{}
\textbf{Packages:}\\
\begin{itemize}
\item R comes with a lot of functionality out-of-the-box
\item Other functionality requires the user to load packages
\item One-time installation: \verb!install.packages("tidyverse")!
\item Each time you open R: \verb!library(tidyverse)!
\end{itemize}
\smallskip{}
\textbf{Commenting:}\\
\begin{itemize}
\item Use \verb!#! to make a comment
\item This tells R to ignore that code
\item[] \verb!# My name is Tyler!
\end{itemize}
\smallskip{}
\textbf{Assignment operator:}\\
\begin{itemize}
\item Use \verb!<-! to store a calculation, e.g. \verb!x <- 3! (``$x=3$'')
\end{itemize}
\smallskip{}
\textbf{Pipe operator:}\\
\begin{itemize}
\item Use \verb!%>%! to ``pipe'' objects
\item[] \verb!y <- mean(log(x))! becomes \verb!y <- x %>% log %>% mean!
\item \verb!%<>%! pipes forward, then backwards
\item[] \verb!x <- mean(log(x))! is same as \verb!x %<>% log %>% mean!
\end{itemize}
\section{Working with Data}
R's fundamental data object is a \textbf{data frame}
\smallskip{}
Like spreadsheets, stores data in columns and rows
\smallskip{}
\verb!tidyverse! uses \textbf{tibbles} (enhanced data frames)
\begin{itemize}
\item[] \verb!df <- as_tibble(mtcars)!
\end{itemize}
\smallskip{}
\textbf{Reading in data}\\
\begin{itemize}
\item Many functions for reading in different types of data
\item[] \verb!df <- read_csv("myfile.csv")! (comma separated)
\item[] \verb!df <- read_fwf("myfile.dat")! (fixed-width)
\item More details: see \href{https://rawgit.com/rstudio/cheatsheets/master/data-import.pdf}{Data Importing Cheat Sheet}
\item \verb!haven! package: import foreign files (e.g. SAS, Stata, ...)
\end{itemize}
\smallskip{}
\textbf{Accessing columns of data}\\
\begin{itemize}
\item To reference a column in a tibble, use \verb!$!
\item[] \verb!df$mpg!
\item[] \verb!mean(df$mpg)! will return sample avg of \verb!mpg! variable
\end{itemize}
\smallskip{}
\textbf{Ignore missing values}\\
\begin{itemize}
\item Missing values are indicated by \verb!NA!
\item Some commands won't automatically ignore \verb!NA! values
\item For these cases, use \verb!na.rm! option
\item[] \verb!mean(df$mpg, na.rm=TRUE)!
\item[] \verb!df$mpg %>% mean(na.rm=TRUE)! (equivalent)
\item Otherwise, \texttt{R} would say the mean is \verb!NA!
\end{itemize}
\smallskip{}
\textbf{Removing columns and rows from a tibble}\\
\begin{itemize}
\item To keep columns in a tibble, use \verb!select()!
\item[] \verb!df1 <- df %>% select(mpg,disp,hp,gear,carb)!
\item To keep rows in a tibble, use \verb!filter()!
\item[] \verb!df1 %<>% filter(mpg>=10)!
\item To remove columns, put a minus in front
\item[] \verb!df1 <- df %>% select(-mpg,-disp)!
\end{itemize}
\smallskip{}
\textbf{Remove missing values from a tibble}\\
\begin{itemize}
\item To remove \textbf{all} rows with \textit{any} \verb!NA! values, use \verb!drop_na()!
\item[] \verb!df1 <- df %>% drop_na()!
\item Can also drop \verb!NA!'s from particular columns:
\item[] \verb!df1 <- df %>% drop_na(gear,carb)!
\end{itemize}
\smallskip{}
\textbf{Creating new columns in a tibble}\\
\begin{itemize}
\item To create a new column in a tibble, use \verb!mutate()!
\item[] \verb!df1 %<>% mutate(mpg.squared = mpg^2)!
\end{itemize}
\smallskip{}
\textbf{Manipulating values of a variable}\\
\begin{itemize}
\item To replace (i.e. recode) values of a variable:
\item[] \verb!df %<>% mutate(gear = replace(gear,gear==4,99))!
\item[] Changes all 4's in \verb!gear! to be 99's
\item[] \verb!gear==4! can be any other logical condition
\item To specify a series of conditions, use \verb!%in%!
\item[] \verb!df %<>% mutate(hp =!
\item[] \verb!replace(hp,hp %in% c(110,120),99))!
\item[] Changes all 110's or 120's in \verb!hp! to be 99's
\end{itemize}
\smallskip{}
\textbf{Working with discrete variables}\\
\begin{itemize}
\item Discrete variables often require special treatment
\item In R, declare discrete variables as \textbf{factors}
\item[] \verb!df %<>% mutate(gear = as.factor(gear))!
\end{itemize}
\smallskip{}
\textbf{Other data manipulations}\\
\begin{itemize}
\item See \href{https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf}{Data Wrangling Cheat Sheet}
\end{itemize}
\section{Getting to know your data}
It's important to know what's in your data by
\begin{enumerate}
\item Looking at summary statistics
\item Performing cross-tabulations
\item Visualizing certain variables
\end{enumerate}
\smallskip{}
\textbf{Summary statistics (\texttt{modelsummary} package)}\\
\begin{itemize}
\item Report quartiles, min/max, mean, sd, and \#\verb!NA!'s:
\item[] \verb!datasummary_skim(df, histogram=FALSE)!
or
\item[] \verb!df %>% datasummary_skim(histogram=FALSE)!
\end{itemize}
\smallskip{}
\textbf{Cross-tabulations}\\
\begin{itemize}
\item Report frequencies of a discrete variable:
\item[] \verb!table(df$gear)!
\item Average $y$ by categories of a discrete $x$ variable:
\item[] \verb!df %>% group_by(gear) %>% !
\item[] \verb!summarize(m.mpg = mean(mpg))!
\end{itemize}
\smallskip{}
\textbf{Visualization}\\
\begin{itemize}
\item Often helpful to look at a histogram or line graph
\item Histogram (continuous $x$):
\item[] \verb!ggplot(df, aes(mpg)) + geom_histogram()!
\item Histogram (factor $x$):
\item[] \verb!ggplot(df,aes(x=gear)) + geom_bar()!
\item Kernel density plot:
\item[] \verb!ggplot(df, aes(mpg)) + geom_density()!
\item Simple scatter plot with linear fit:
\item[] \verb!ggplot(df,aes(disp,mpg)) + geom_point() +!
\item[] \verb!geom_smooth(method="lm")!
\item More details: see \href{https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf}{ggplot2 Cheat Sheet}
\end{itemize}
\section{Regression modeling}
\smallskip{}
\textbf{Basic OLS regression}\\
\begin{itemize}
\item Regression:
\item[] \verb!est <- lm(mpg ~ gear + hp, data=df)!
\item Examine regression output:
\item[] \verb!summary(est)!
\item[] \verb!tidy(est)!
\item[] \verb!modelsummary(est)!
\item Other functional forms:
\item[] \verb!est <- lm(mpg ~ gear + I(gear^2), data=df)!
\item[] \verb!est <- lm(log(mpg) ~ gear + I(gear^2), data=df)!
\item Factor variables automatically get separate intercepts
\end{itemize}
\smallskip{}
\textbf{$t$-statistics and $F$-statistics}\\
\begin{itemize}
\item $t$-stats, $p$-values reported in regression output
\item $F$-test:
\item[] \verb!linearHypothesis(est,c("gear","hp"))!
\item[] tests $H_0: \beta_{gear} = 0, \beta_{hp}=0$
\item[] \verb!linearHypothesis(est,c("gear=5","hp=-1"))!
\item[] tests $H_0: \beta_{gear} = 5, \beta_{hp}=-1$
\item Robust $F$-test (see next section):
\item[] \verb!linearHypothesis(est.rob,c("gear","hp"))!
\end{itemize}
\smallskip{}
\textbf{Robust standard errors (\texttt{estimatr} and \texttt{sandwich} packages)}\\
\begin{itemize}
\item Correct for heteroskedasticity:
\item[] \verb!est.rob <- lm_robust(mpg ~ gear + hp, data=df)!
\item[] \verb!modelsummary(est.rob)!
\item[]
\item Correct for serial correlation:
\item[] \verb!est <- lm(mpg ~ gear + hp, data=df)!
\item[] \verb!modelsummary(est, vcov=sandwich::NeweyWest(est))!
\item[]
\item Correct for clustering:
\item[] \verb!est.clust <- lm_robust(mpg ~ gear + hp, data=df,!
\item[] \verb!clusters=df$carb)!
\item[] \verb!modelsummary(est.clust)!
\end{itemize}
\section{Instrumental Variables}
\smallskip{}
\begin{itemize}
\item Let \verb!drat! be the endogenous covariate
\item Let \verb!wt! be the instrument
\item Let \verb!qsec! and \verb!gear! be exogenous covariates
\item[] \verb!est.iv <- ivreg(mpg ~ drat + qsec + gear | !
\item[] \verb!wt + qsec + gear, data=df)!
\item Instruments come after the \verb!|! symbol
\item Endogenous covariates come before the \verb!|! symbol
\item Exogenous covariates appear on both sides of the \verb!|!
\item First-stage regression:
\item[] \verb!est.1 <- lm(drat ~ wt + qsec + gear, data=df)!
\item[] \verb!df %<>% mutate(drat.hat = est.1$fitted.values)!
\item Second-stage regression:
\item[] \verb!est.2 <- lm(mpg ~ drat.hat + qsec+ gear,data=df)!
\item Can also use \verb!estimatr! for robust SEs:
\item[] \verb!est.ivr <- iv_robust(mpg ~ drat + qsec + gear | !
\item[] \verb!wt + qsec + gear, data=df)!
\end{itemize}
\section{Working with time series data}
\smallskip{}
\begin{itemize}
\item Declare a time series data frame
\item[] \verb!df.ts <- as_tsibble(df, key=id, index=year)!
\item Time series line plot:
\item[] \verb!ggplot(df.ts, aes(year, inf)) + geom_line()!
\item Simple AR(1) model:
\item[] \verb!est <- lm(inf ~ lag(inf,1), data=df.ts)!
\item First-differences model:
\item[] \verb!est.diff <- lm(difference(inf) ~ unem,!
\item[] \qquad\qquad\qquad\verb!data = df.ts)!
\item ADF test for unit root:
\item[] \verb!adf.test(df.ts$inf, k=1)!
\item ARIMA model:
\item[] \verb!est.arima <- auto.arima(df.ts$inf)!
\item Plot $h$-period-ahead forecast intervals
\item[] \verb!autoplot(forecast(est.arima, h=2))!
\item Extended date and time functions available in \href{https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_lubridate.pdf}{\texttt{lubridate} package}
\end{itemize}
\section{Working with panel data}
\smallskip{}
\begin{itemize}
\item Report number of units and time periods
\item[] \verb!pdim(df)!
\item Pooled OLS model
\item[] \verb!est.pols <- plm(lwage ~ exper + I(exper^2) +!
\item[] \verb!year, data = df, index = c("id","year"),!
\item[] \verb!model = "pooling")!
\item Random effects model
\item[] \verb!est.re <- plm(lwage ~ exper + I(exper^2) +!
\item[] \verb!year, data = df, index = c("id","year"),!
\item[] \verb!model = "random")!
\item Fixed effects model
\item[] \verb!est.fe <- plm(lwage ~ exper + I(exper^2) +!
\item[] \verb!year, data = df, index = c("id","year"),!
\item[] \verb!model = "within")!
\item First differences model
\item[] \verb!est.fd <- plm(lwage ~ exper + I(exper^2) +!
\item[] \verb!year, data = df, index = c("id","year"),!
\item[] \verb!model = "fd")!
\end{itemize}
\section{Limited dependent variable models}
\smallskip{}
\textbf{Linear probability model (LPM):}\\
\begin{itemize}
\item If $y$ is a factor, format it as a numeric
\item[] \verb!est.lpm <- lm(as.numeric(y) ~ x1 + x2, data=df)!
\end{itemize}
\textbf{Logit and Probit:}\\
\smallskip{}
In this case, $y$ should be formatted as a factor
\smallskip{}
\begin{itemize}
\item Logit:
\item[] \verb!est.logit <- glm(y ~ x1 + x2,!
\item[] \verb!family=binomial(link="logit"),data=df)!
\item Probit:
\item[] \verb!est.probit <- glm(y ~ x1 + x2,!
\item[] \verb!family=binomial(link="probit"),data=df)!
\end{itemize}
% \textcolor{white}{\lipsum[1-12]}
\section{List of packages}
The document requires the following packages:
\smallskip{}
\begin{multicols}{4}
\verb!tidyverse! \\
\verb!magrittr! \\
\verb!broom! \\
\verb!modelsummary! \\
\verb!car! \\
\verb!estimatr! \\
\verb!lmtest! \\
\verb!clubSandwich! \\
\verb!sandwich! \\
\verb!AER! \\
\verb!tsibble! \\
\verb!tseries! \\
\verb!lubridate! \\
\verb!forecast! \\
\verb!plm! \\
\end{multicols}
\rule{0.3\linewidth}{0.25pt}
\scriptsize
Layout by Winston Chang, \href{http://wch.github.io/latexsheet/}{http://wch.github.io/latexsheet/}
\end{multicols}
\end{document}