-
Notifications
You must be signed in to change notification settings - Fork 278
/
Copy pathreviewNotes.tex
751 lines (730 loc) · 36.9 KB
/
reviewNotes.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
\documentclass[12pt]{article}
\usepackage{geometry,amsmath,amssymb, graphicx, natbib, float, enumerate}
\geometry{margin=1in}
%\renewcommand{\familydefault}{cmss}
\usepackage{charter}
\restylefloat{table}
\restylefloat{figure}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\logit}{\mathrm{logit}}
\newcommand{\RQ}{[{\bf REQUIRED}]~}
\begin{document}
\section*{On these review notes}
\begin{enumerate}[1.]
\item You are responsible for the correctness of all of the formulae
on this review sheet. (There are undoubtedly ytopgraphical errors :-).
\end{enumerate}
\newpage
% {\bf Formula sheet for the exam}\\
% Here is a list of the formulae that you will be given on the exam.
% Again, {\em you} are responsible for the correctness of these
% formulae.
% \begin{enumerate}[1.]
% \item $P(A | B) = P(A \cap B) / P(B)$
% \item Bayes rule:$P(A | B) = \frac{P(B | A) P(A)}{P(B | A) P(A) + P(B | A^c) P(A^c)}.$
% \item $E[X] = \sum X p(x)$ discrete $E[X] = \int x f(x) dx$ continuous.
% \item $\mathrm{Var}(X) = E[(X - \mu)^2] = E[X^2] - E[X]^2$.
% \item $P(X = k) = \left(\begin{array}{c} n \\ k \end{array}\right) p^k(1 - p)^{n - k}.$
% \item $(2 \pi \sigma^2)^{-1/2}\exp\{-(x - \mu)^2 / 2 \sigma^2\}$.
% \item $S^2 = \sum (X_i - \bar X)^2 / (N - 1)$.
% \item $N = (Z_{1-\alpha} + Z_{1 - \beta})^2\sigma^2 / \delta^2$.
% \item $\bar X \pm t_{n-1,1-\alpha/2} \frac{S}{\sqrt{n}}$.
% \item $\frac{\bar{X} - \mu_0}{S / \sqrt{n}}$.
% \item $\frac{\hat p - p}{\sqrt{p_0 (1 - p_0) / n}}$.
% \item $\hat p \pm Z_{1 - \alpha / 2} \sqrt{\hat p (1 - \hat p) / n}$.
% \item $\frac{\hat p - p_0}{\sqrt{p_0 (1 - p_0) / n}}$,
% $\frac{\hat p - p_0}{\sqrt{\hat p (1 - \hat p) / n}}$.
% \item $\hat p \left(\frac{n}{n + Z_{1 - \alpha / 2}^2}\right) +
% \frac{1}{2} \left(\frac{Z_{1 - \alpha / 2}^2}{n + Z_{1 - \alpha / 2}^2}\right)
% \pm Z_{1 - \alpha/2}\sqrt{\frac{1}{n + Z_{1 - \alpha / 2}^2}
% \left[\hat p (1 - \hat p) \left(\frac{n}{n + Z_{1 - \alpha / 2}^2}\right) +
% \frac{1}{4} \left(\frac{Z_{1 - \alpha / 2}^2}{n + Z_{1 - \alpha / 2}^2}\right)
% \right]}.$
% \item $\frac{\bar X - \bar Y}{S_p \sqrt{\frac{1}{n_x} + \frac{1}{n_y}}}$.
% \item $\frac{\bar X - \bar Y}{\sqrt{\frac{S_x^2}{n_x} + \frac{S_y^2}{n_y}}}$.
% \item $F = \frac{S_1^2}{S_2^2}$.
% \item $P\left(Z \geq \frac{\mu_0 - \mu_1}{\sigma / \sqrt{n}} + Z_{1 - \alpha} \right)$,
% $P\left(Z \leq \frac{\mu_0 - \mu_1}{\sigma / \sqrt{n}} - Z_{1 - \alpha} \right)$.
% \item $S_p^2 = \frac{(n_x - 1) S_x^2 + (n_y - 1) S_y^2}{n_x + n_y -
% 2}$
% \end{enumerate}
\newpage
\section{Set theory}
\begin{enumerate}[1.]
\item Notation - $\subset$ means ``is a subset of'', $\in$ means ``is an element of''.
\item The {\bf sample space}, $\Omega$, is the space of all possible outcomes of an experiment.
\item An {\bf event}, say $A \subset \Omega$, is a subset of $\Omega$.
\item The {\bf union} of two events, $A \cup B$, is the collection of elements that are in $A$, $B$ or both.
\item The {\bf intersection} of two events, $A \cap B$, is the collection of elements that are in both $A$ and $B$.
\item The {\bf complement} of an event, say $\bar A$ or $A^c$, is all of the elements of $\Omega$ that are not in $A$.
\item The {\bf null} or {\bf empty} set is denoted $\emptyset$.
\item Two sets are {\bf disjoint} or {\bf mutually exclusive}
if their intersection is empty, $A\cap B = \emptyset$.
\item {\bf DeMorgan's laws} state that $(A\cup B)^c = A^c \cap B^c$ and
$(A\cap B)^c = A^c \cup B^c$.
\end{enumerate}
\section{Probability basics}
\begin{enumerate}[1.]
\item A {\bf probability measure}, say $P$, is a function on the
collection of events to $[0,1]$ so that:
\begin{enumerate}[a.]
\item $P(\Omega) = 1$.
\item If $A\subset \Omega$ then $P(A) \geq 0$.
\item If $A_1,\ldots, A_n$ are disjoint then ({\bf finite additivity})
$
P(\cup_{i=1}^n A_i) = \sum_{i=1}^n P(A_i).
$
\end{enumerate}
\item $P(\bar A) = 1 - P(A)$.
\item The {\bf odds} of an event, $A$, are $P(A) / (1 - P(A)) = P(A) / P(\bar A)$.
\item $P(A \cup B) = P(A) + P(B) - P(A\cap B)$.
\item If $A \subset B$ then $P(A) \leq P(B)$.
\item Two events $A$ and $B$ are {\bf independent} if $P(A\cap B) = P(A) P(B)$. A collection of
events, $A_i$, are {\bf mutually independent} if $P(\cap_{i=1}^n A_i) = \prod_{i=1}^n P(A_i)$.
\item Pairwise independence of a collection of events does not imply
mutual independence, though the reverse is true.
\item Given that $P(B) > 0$, the conditional probability of $A$ given that $B$ has occurred is
$P(A | B) = P(A \cap B) / P(B)$.
\item Two events $A$ and $B$ are {\bf independent} if $P(A | B) = P(A)$.
\item The {\bf law of total probability} states that if $A_i$ are a
collection of {\em mutually exclusive events} so that $\Omega = \cup_{i=1}^n A_i$,
then $P(C) = \sum_{i=1}^n P(C | A_i)P(A_i)$ for any event $C$.
\item {\bf Bayes' rule} states that if $A_i$ are a collection of
{\em mutually exclusive events} so that $\Omega = \cup_{i=1}^n
A_i$, then
$$
P(A_j | C) = \frac{P(C | A_j) P(A_j)}{\sum_{i=1}^n P(C|A_i)P(A_i)}.
$$
for any set $C$ (with positive probability). Notice $A$ and
$\bar A$ are disjoint and $A\cup A^c = \Omega$ so that we have
$$
P(A | B) = \frac{P(B | A) P(A)}{P(B | A) P(A) + P(B | A^c) P(A^c)}.
$$
\item The {\bf sensitivity} of a diagnostic test is defined to
be $P(+ | D)$ where $+$ ($-$) is the event of a positive
(negative) test result and $D$ is the event that a subject has the
disease in question. The {\bf specificity} of a diagnostic test is
$P(- | \bar D)$.
\item Bayes' rule yields that
$$
P(D | + ) = \frac{P(+ | D) P(D)}{P(+ | D) P(D) + P(+ | D^c) P(D^c)},
$$
and
$$
P(D^c | -) = \frac{P(- | D^c) P(D^c)}{P(- | D^c) P(D^c) + P(- | D) P(D)}.
$$
\item The {\bf likelihood ratio} of a positive test result is
$P(+ | D) / P(+ | \bar D) = \mbox{sensitivity} / (1 - \mbox{specificity})$.
The likelihood ratio of a negative test result is
$P(- | \bar D) / P(- | D) = \mbox{specificity} / (1 - \mbox{sensitivity})$.
\item The odds of disease after a positive test are related to the odds of disease
before the test by the relation
$$
\frac{P(D | +)}{P(D^c | +)} = \frac{P(+ | D)}{P(+ | D^c)} \frac{P(D)}{P(D^c)}.
$$
That is, the posterior odds equal the prior odds times the likelihood ratio.
Correspondingly,
$$
\frac{P(D^c | -)}{P(D | -)} = \frac{P(- | D^c)}{P(- | D)} \frac{P(D^c)}{P(D)}.
$$
This yields a method for evaluating the results of a diagnostic test without
knowledge of the disease prevalence.
\end{enumerate}
\section{Random variables}
\begin{enumerate}[1.]
\item A {\bf random variable} is a function from $\Omega$ to the real numbers.
A random variable is a random number that is the result of an experiment
governed by a probability distribution.
\item A {\bf Bernoulli} random
variable is one that takes the value 1 with probability $p$ and 0
with probability $(1 - p)$. That is, $P(X = 1) = p$ and $P(X = 0) =
1 - p$.
\item A {\bf probability mass function} (pmf) is a function that yields the various
probabilities associated with a random variable. For example, the probability
mass function for a Bernoulli random variable is $f(x) = p^x(1 - p)^{1 - x}$ for $x = 0, 1$
as this yields $p$ when $x = 1$ and $(1 - p)$ when $x = 0$.
\item The {\bf expected value} or (population) {\bf mean}
of a discrete random variable, $X$, with pmf $f(x)$ is
$$
\mu = E[X] = \sum_{x} x f(x).
$$
The mean of a Bernoulli variable is then $1 f(1) + 0 f(0) = p$.
\item The {\bf variance} of any random variable, $X$, (discrete or continuous) is
$$
\sigma^2 = E\left[(X - \mu)^2\right] = E[X^2] - E[X]^2.
$$
The latter formula being the most convenient for computation. The variance of
a Bernoulli random variable is $p(1-p)$.
\item The (population) {\bf standard deviation}, $\sigma$, is the
square root of the variance.
\item {\bf Chebyshev's inequality} states that for any random variable
$P(|X - \mu| \geq K\sigma) \leq 1 / K ^ 2$. This yields a way to
interpret standard deviations.
\item A {\bf Binomial} random variable, $X$, is obtained as the sum of $n$ Bernoulli
random variables and has pmf
$$
P(X = k) = \left(\begin{array}{c} n \\ k \end{array}\right) p^k(1 - p)^{n - k}.
$$
Binomial random variables have expected value $np$ and variance $np(1-p)$.
\end{enumerate}
\section{Continuous random variables}
\begin{enumerate}[1.]
\item {\bf Continuous} random variables take values on a continuum.
\item The probability that a continuous random variable takes on any specific
value is 0.
\item Probabilities associated with continuous random variables are governed by
{\bf probability density functions} (pdfs). Areas under probability density
functions correspond to probabilities. For example, if $f$ is a pdf corresponding
to random variable $X$, then
$$
P(a \leq X \leq b) = \int_a^b f(x)dx.
$$
To be a pdf, a function must be nonnegative and integrate to 1.
That is, $\int_{-\infty}^{\infty} f(x)dx = 1$.
\item If $h$ is a positive function such that $\int_{-\infty}^{\infty}
h(x)dx < \infty$ then $f(x) = h(x) / \int_{-\infty}^{\infty}
h(x)dx$ is a valid density. Therefore, if we only know a density up to a constant
of proportionality, then we can figure out the exact density.
\item The expected value, or mean, of a continuous random variable,
$X$, with pdf $f$, is
$$
\mu = E[X] = \int_{-\infty}^{\infty} t f(t) dt.
$$
\item The variance is $\sigma^2 = E[(X - \mu)^2] = E[X^2]-E[X]^2$.
\item The {\bf distribution function}, say $F$, corresponding to a random variable $X$ with
pdf, $f$, is
$$
P(X \leq x) = F(x) = \int_{-\infty}^x f(t)dt.
$$
(Note the common convention that $X$ is used when describing an unobserved random variable
while $x$ is for specific values.)
\item The $p^{th}$ {\bf quantile} (for $0\leq p \leq 1$), say $X_p$,
of a distribution function, say $F$, is the point so that $F(X_p) =
p$. For example, the $.025^{th}$ quantile of the standard normal
distribution is -1.96.
\end{enumerate}
\section{Properties of expected values and variances}
The following properties hold for all expected values (discrete or continuous)
\begin{enumerate}[1.]
\item Expected values commute across sums: $E[X + Y] = E[X] + E[Y]$.
\item Multiplicative and additive constants can be pulled out
of expected values $E[cX] = cE[X]$ and $E[c + X] = c + E[X]$.
\item For independent random variables, $X$ and $Y$, $E[XY] = E[X]E[Y]$.
\item In general, $E[h(X)] \neq h(E[X])$.
\item Variances commute across sums {\em for independent variables}
$\mathrm{Var}(X + Y) = \mathrm{Var}(X) + \mathrm{Var}(Y)$.
\item Multiplicative constants are squared when pulled out of variances
$\mathrm{Var}(cX) = c^2 \mathrm{Var}(X)$.
\item Additive constants do not change variances: $\mathrm{Var}(c + X)
= \mathrm{Var}(X)$.
\end{enumerate}
\section{The normal distribution}
\begin{enumerate}[a.]
\item The {\bf Bell curve} or {\bf normal} or {\bf Gaussian} density is the
most common density. It is specified by its mean, $\mu$, and variance, $\sigma^2$.
The density is given by $f(x) = (2 \pi \sigma^2)^{-1/2}\exp\{-(x - \mu)^2 / 2 \sigma^2\}$.
We write $X\sim \mathrm{N}(\mu, \sigma^2)$ to denote that $X$ is normally distributed
with mean $\mu$ and variance $\sigma^2$.
\item The {\bf standard normal} density, labeled $\phi$, corresponds to a normal density
with mean $\mu = 0$ and variance $\sigma^2 = 1$.
$$
\phi(z) = (2 \pi)^{-1/2}\exp\{-z^2 / 2\}.
$$
The standard normal distribution function is usually labeled $\Phi$.
\item If $f$ is the pdf for a $\mathrm{N}(\mu,\sigma^2)$ random variable, $X$,
then note that $f(x) = \phi\{(x - \mu) / \sigma\} / \sigma$.
Correspondingly, if $F$ is the associated distribution
function for $X$, then $F(x) = \Phi\{(x - \mu) / \sigma\}$.
\item If $X$ is normally distributed with mean $\mu$ and variance
$\sigma^2$ then the random variable $Z = (X - \mu) / \sigma$ is standard normally
distributed. Taking a random variable subtracting its mean and
dividing by its standard deviation is called ``standardizing'' a
random variable.
\item If $Z$ is standard normal then $X = \mu + Z \sigma$ is normal with mean
$\mu$ and variance $\sigma^2$.
\item Approximately 68\%, 95\% and 99.7\% of the mass of any normal distribution lies
within 1, 2 and 3 (respectively) standard deviations from the mean.
\item $Z_\alpha$ refers to the $\alpha^{th}$ quantile of the standard normal
distribution. $Z_{.90}$, $Z_{.95}$, $Z_{.975}$ and $Z_{.99}$ are
approximately 1.28, 1.645, 1.96 and 2.33.
\item Sums and means of jointly normal random variables are normal (regardless of whether or not
they are independent). You can use the rules for expectations and variances to
figure out $\mu$ and $\sigma$.
\item The sample variance of iid normal random variables, appropriately
normalized, is a Chi-squared random variable (see below).
\end{enumerate}
\section{Sample means and variances}
Throughout this section let $X_i$ be a collection of iid random
variables with mean $\mu$ and variance $\sigma^2$.
\begin{enumerate}[1.]
\item We say random variables are {\bf iid} if they are independent and
identically distributed.
\item For random variables, $X_i$, the {\bf sample mean} is $\bar X = \sum_{i=1}^nX_i / n$.
\item $E[\bar X] = \mu = E[X_i]$ (does not require the independence or constant variance).
\item If the $X_i$ are iid with variance $\sigma^2$ then
$\mathrm{Var}(\bar X) = \mathrm{Var}(X_i) / n = \sigma^2 / n$.
\item The {\bf sample variance} is defined to be
$$
S^2 = \frac{\sum_{i=1}^n (X_i - \bar X)^2}{n - 1}.
$$
\item $\sum_{i=1}^n (X_i - \bar X)^2 = \sum_{i=1}^n X_i^2 - n\bar X^2$ is a shortcut
formula for the numerator.
\item $\sigma / {\sqrt n}$ is called the {\bf standard error} of $\bar X$. The estimated
standard error of $\bar X$ is $S / \sqrt{n}$. Do not confuse dividing by this $\sqrt n$
with dividing by $n-1$ in the calculation of $S^2$.
\item An estimator is {\bf unbiased} if its expected value equals the parameter
it is estimating.
\item $E[S^2] = \sigma^2$, which is why we divide by $n-1$ instead of
$n$. That is, $S^2$ is unbiased. However, dividing by $n-1$
rather than $n$ does increase the variance of this estimator
slightly, $\mathrm{Var}(S^2) \geq \mathrm{Var}((n-1)S^2 / n)$.
\item If the $X_i$ are normally distributed with mean $\mu$ and variance $\sigma^2$,
then $\bar X$ is normally distributed with mean $\mu$ and variance $\sigma^2 / n$.
\item The {\bf Central Limit Theorem}. If the $X_i$ are iid with
mean $\mu$ and (finite) variance $\sigma^2$ then
$$
Z = \frac{\bar X - \mu}{\sigma / \sqrt n}
$$
will limit to a standard normal distribution. The result is true for small sample sizes,
if the $X_i$ are iid normally distributed.
\item If we replace $\sigma$ with $S$; that is,
$$
Z = \frac{\bar X - \mu}{S / \sqrt n},
$$
then $Z$ still limits to a standard normal. If the $X_i$ are iid normally distributed,
then $Z$ follows the Student's $T$ distribution for small $n$.
\end{enumerate}
\section{Confidence intervals for a mean using the CLT.}
\begin{enumerate}[1.]
\item Using the CLT, we know that
$$
P\left(-Z_{1 - \alpha / 2} \leq \frac{\bar X - \mu}{S / \sqrt n} \leq Z_{1 - \alpha / 2}\right)
= 1 - \alpha
$$
for large $n$. Solving the inequalities for $\mu$, we calculated that in repeated
sampling, the interval
$$
\bar X \pm Z_{1 - \alpha / 2} \frac{S}{\sqrt{n}}
$$
will contain $\mu$ 100$(1-\alpha)$\% of the time.
\item The probability that $\mu$ is in an observed confidence interval is either 1 or 0.
The correct interpretation is that in repeated sampling, the interval we obtain
will contain $\mu$ 100$(1 - \alpha)\%$ of the time. (Assumes that the CLT has kicked in).
\item As $n$ increases, the interval gets narrower.
\item As $S$ increases, the interval gets wider.
\item As the {\bf confidence level}, $(1-\alpha)$, increases, the interval gets wider.
\item Fixing the confidence level controls the {\bf accuracy} of the
interval. A 95\% interval has 95\% coverage regardless of the sample
size. (Again, assuming that the CLT has kicked in.) Increasing $n$
will improve the precision (width) of the interval.
\item Prior to conducting a study, you can fix the {\bf margin of
error} (half width), say $\delta$, of the interval by setting $n =
(Z_{1 - \alpha / 2} \sigma / \delta)^2$. Round up. Requires an estimate of $\sigma$.
\end{enumerate}
\section{Confidence intervals for a variance and T confidence intervals}
\begin{enumerate}[1.]
\item If $X_i$ are iid normal random variables with mean $\mu$ and variance
$\sigma^2$ then $\frac{(n - 1)S^2}{\sigma^2}$
follows what is called a Chi-squared distribution with $n-1$ degrees of freedom.
\item Using the previous item, we know that
$$
P\left(\chi^2_{n-1,\alpha/2} \leq \frac{(n-1)S^2}{\sigma^2} \leq \chi^2_{n-1,1-\alpha/2}
\right) = 1 - \alpha,
$$
where $\chi^2_{n-1,\alpha}$ denotes the $\alpha^{th}$ quantile of the Chi-squared
distribution. Solving these inequalities for $\sigma^2$ yields
$$
\left[\frac{(n-1)S^2}{ \chi^2_{n-1,1-\alpha/2}},\frac{(n-1)S^2}{\chi^2_{n-1,\alpha/2}}\right]
$$
is a 100$(1 - \alpha)$\% confidence interval for $\sigma^2$. Recall this assumes that
the $X_i$ are iid Gaussian random variables.
\item The fact that $(n - 1) S^2 \sim \mbox{Gamma}((n - 1) / 2, 2\sigma^2)$ can
be used to create a likelihood interval for $\sigma$ or $\sigma^2$.
\item Chi-squared tests, intervals and likelihood intervals for
variances are not robust to the normality assumption.
\item If $Z$ is standard normal and $X$ is an independent Chi-squared
with $df$ degrees of freedom then $\frac{Z}{\sqrt{X / df}}$ follows
what is called a Student's $T$ distribution with $df$ degrees of freedom.
\item The Student's $T$ density looks like a normal density with heavier
tails (so it looks more squashed down).
\item By the previous item, if the $X_i$ are iid $\mathrm{N}(\mu,\sigma^2)$ then
$$
Z = \frac{\bar X - \mu}{S / \sqrt n}
$$
follows a Student's $T$ distribution with $(n-1)$ degrees of freedom. Therefore
if $t_{n-1,\alpha}$ is the $\alpha^{th}$ quantile of the Student's $T$ distribution
then
$$
\bar X \pm t_{n-1,1-\alpha/2} \frac{S}{\sqrt{n}}
$$
is a 100$(1 - \alpha)$\% confidence interval for $\mu$.
\item The Student's $T$ confidence interval assumes normality of the
$X_i$. However, the $T$ distribution has quite heavy tails and so the
interval is conservative and works well in many situations.
\item For large sample sizes, the Student's $T$ and CLT based intervals are
nearly the same because the Student's $T$ quantiles become more and more
like standard normal quantiles as $n$ increases.
\item For small sample sizes, it is difficult to diagnose normality/lack of normality.
Regardless, the robust T interval should be your default option.
\item The fact that $\sqrt{n} \bar X / S $ is non-central $T$ with
$n-1$ degrees of freedom and non-centrality parameter $\sqrt{n}\mu/\sigma$ can be
used to create a likelihood interval for the effect size $\mu / \sigma$.
\item Assuming the underlying normality of the data,
the profile likelihood for $\mu$ is $\left(\sum (x_i - \mu)^2\right)^{-n/2}$.
\end{enumerate}
\section{EDA}
\begin{enumerate}[1.]
\item The $p^{th}$ {\bf empirical quantile} of a data set is that
point so that $100p\%$ of the data lies below it. The sample {\bf
median} is the $.50^{th}$ quantile. Empirical quantiles estimate
population quantiles.
\item A {\bf boxplot} plots a box with a centerline at the sample median
and the box edges at the lower and upper quartiles. ``Whiskers'' extend
to the most extreme data points that are within 1.5 times the IQR (inter quartile range)
of the box edges. Side
by side boxplots are useful to compare groups.
\item A {\bf quantile-quantile} (qq) plot, plots empirical quantiles
versus the theoretical quantiles. For normal random variables with
mean $\mu$ and variance $\sigma^2$, let $X_p$ be the $p^{th}$
quantile. Then, $X_p = \mu + Z_p \sigma$. Therefore plotting the empirical
quantiles versus the standard normal quantiles can be used to diagnose
non-normality (a {\bf normal qq} plot). Any deviation from a straight line
indicates non-normality.
\item {\bf Kernel density estimates}, {\bf histograms} and {\bf stem and leaf}
plots show estimates of the density. Each relies on tuning parameters that
you should vary. KDEs and histograms should only be used if you have enough
data.
\end{enumerate}
\section{The bootstrap}
\begin{enumerate}[1.]
\item The (non-parametric) {\bf bootstrap} can be used to calculate
{\bf percentile bootstrap confidence intervals}.
\item The {\bf bootstrap principle} is to use the empirical distribution
defined by the data to obtain an estimate of the sampling distribution
of a statistic. In practice the bootstrap principle is always executed
by {\bf resampling} from the observed data.
\item Assume that we have $n$ data points. The bootstrap obtains a
confidence interval by sampling $m$ complete data sets by drawing
with replacement from the original data. The statistic of interest,
say the median, is applied to all $m$ of the resampled data sets, yielding
$m$ medians. The percentile confidence interval is obtained by taking
the $\alpha / 2$ and $1 - \alpha /2$ quantiles of the $m$ medians.
\item Make sure you do enough resamples so that your confidence interval
has stabilized.
\item Bootstrap intervals are interpreted the same as frequentist intervals.
\item To guarantee coverage, the bootstrap interval requires large
sample sizes.
\item There are improvements to the percentile method that are not covered in
this class.
\end{enumerate}
\section{The log-normal distribution}
\begin{enumerate}[1.]
\item We use ``$\log$'' to represent the natural logarithm (base $e$).
\item A random variable $X$ is log-normal with parameters $\mu$ and
$\sigma^2$ if $Y = \log X$ is normal with mean $\mu$ and variance
$\sigma^2$.
\item $\mu$ is $E[Y] = E[\log X]$. Because the mean and median are
the same for the normal distribution, $\mu$ is also the median for $\log X$.
Notice that $\exp\{E[\log X]\} = e^\mu \neq E[X]$. However, because
$\mu$ is the median for $\log X$
$$
.5 = P(\log X \leq \mu) = P(X \leq e^\mu).
$$
Therefore $e^\mu$ is also the median on the original data scale.
\item Assuming log-normality, exponentiating a Student's $T$ confidence
interval for $\mu$ (using the logged data) yields a confidence
interval for the median on the original data scale.
\end{enumerate}
% \section{Hypothesis testing for a single mean}
% \begin{enumerate}[1.]
% \item The null, or status quo, hypothesis is labeled $H_0$, the alternative
% $H_a$ or $H_1$ or $H_2$ ...
% \item A {\bf type I error} occurs when we falsely reject the null hypothesis.
% The probability of a type I error is usually labeled $\alpha$.
% \item A {\bf type II error} occurs when we falsely fail to reject the null
% hypothesis. A type II error is usually labeled $\beta$.
% \item A {\bf Power} is the probability that we correctly
% reject the null hypothesis, $1 - \beta$.
% \item The $Z$ test for $H_0:\mu = \mu_0$ versus $H_1: \mu < \mu_0$ or $H_2: \mu \neq \mu_0$ or
% $H_3: \mu > \mu_0$ constructs a test statistic
% $
% TS = \frac{\bar{X} - \mu_0}{S / \sqrt{n}}
% $
% and rejects the null hypothesis when
% \begin{enumerate}[$H_1$]
% \item $TS \leq -Z_{1 - \alpha}$
% \item $|TS| \geq Z_{1 - \alpha / 2}$
% \item $TS \geq Z_{1 - \alpha}$
% \end{enumerate}
% respectively.
% \item The $Z$ test requires the assumptions of the CLT and for $n$ to be large enough
% for it to apply.
% \item If $n$ is small, then a Student's $T$ test is performed exactly in the same way,
% with the normal quantiles replaced by the appropriate Student's $T$ quantiles and
% $n-1$ df.
% \item Tests define confidence intervals by considering the collection of values
% of $\mu_0$ for which you fail to reject a two sided test. This yields exactly the
% $T$ and $Z$ confidence intervals respectively.
% \item Conversely, confidence intervals define tests by the rule where one rejects
% $H_0$ if $\mu_0$ is {\em not in} the confidence interval.
% \item A {\bf P-value} is the probability of getting evidence as extreme or more extreme
% than we actually got under the null hypothesis. For $H_3$ above, the P-value is calculated
% as $P(Z \geq TS_{obs} | \mu = \mu_0)$ where $TS_{obs}$ is the observed value of our
% test statistic. To get the P-value for $H_2$, calculate a one sided P-value and double it.
% \item The P-value is equal to the {\bf attained significance level}.
% That is, the smallest $\alpha$ value for which we would have
% rejected the null hypothesis. Therefore, rejecting the null
% hypothesis if a P-value is less than $\alpha$ is the same as
% performing the rejection region test.
% \item The power of a $Z$ test for $H_3$ is given by the formula (know how this is obtained)
% $$
% P(TS > Z_{1 - \alpha} | \mu = \mu_1) =
% P\left(Z \geq \frac{\mu_0 - \mu_1}{\sigma / \sqrt{n}} + Z_{1 - \alpha} \right).
% $$
% Notice that power requires a value for $\mu_1$, the value under
% the alternative hypothesis. Correspondingly for $H_1$ we have
% $$
% P\left(Z \leq \frac{\mu_0 - \mu_1}{\sigma / \sqrt{n}} - Z_{1 - \alpha} \right).
% $$
% For $H_2$, the power is approximately the appropriate one sided power using $\alpha/2$.
% \item Some facts about power.
% \begin{enumerate}[a.]
% \item Power goes up as $\alpha$ goes up.
% \item Power of a one sided test is greater than the power of the associated two sided test.
% \item Power goes up as $\mu_1$ gets further away from $\mu_0$.
% \item Power goes up as $n$ goes up.
% \end{enumerate}
% \item The prior formula can be used to calculate the sample size. For
% example, using the power formula for $H_1$, setting $Z_{1 - \beta} =
% \frac{\mu_0 - \mu_1}{\sigma / \sqrt{n}} - Z_{1 - \alpha}$ yields
% $$
% n = \frac{(Z_{1 - \beta} + Z_{1 - \alpha}) ^ 2 \sigma^2}{(\mu_0 - \mu_1)^2},
% $$
% which gives the sample size to have power = $1-\beta$. This
% formula applies for $H_3$ also. For the two sided test, $H_2$, replace $\alpha$ by
% $\alpha / 2$.
% \item Determinants of sample size.
% \begin{enumerate}[a.]
% \item $n$ gets larger as $\alpha$ gets smaller.
% \item $n$ gets larger as the power you want gets larger.
% \item $n$ gets larger the closer $\mu_1$ is to $\mu_0$.
% \end{enumerate}
% \end{enumerate}
\section{Binomial confidence intervals and tests}
\begin{enumerate}[1.]
\item Binomial distributions are used to model proportions. If
$X \sim \mathrm{Binomial}(n,p)$ then $\hat p = X / n$ is a sample
proportion.
\item $\hat p$ has the following properties.
\begin{enumerate}[a.]
\item It is a sample mean of Bernoulli random variables.
\item It has expected value $p$.
\item It has variance $p (1 - p) / n$. Note that the largest value that $p (1 - p)$ can
take is $1/4$ at $p = 1/2$.
\item $Z = \frac{\hat p - p}{\sqrt{p (1 - p) / n}}$ follows a standard normal distribution
for large $n$ by the CLT. The convergence to normality is fastest when $p = .5$.
\end{enumerate}
% \item The {\bf Wald test} for $H_0: p = p_0$ versus one of $H_1: p < p_0$, $H_2: p \neq p_0$, and
% $H_3: p > p_0$ uses the test statistic
% $$
% TS = \frac{\hat p - p_0}{\sqrt{\hat p (1 - \hat p) / n}}
% $$
% which is compared to standard normal quantiles.
\item The {\bf Wald confidence interval} for a binomial proportion is
$$
\hat p \pm Z_{1 - \alpha / 2} \sqrt{\hat p (1 - \hat p) / n}.
$$
The Wald interval is the interval obtained by inverting the Wald test (and vice versa).
% \item The {\bf Score test} for a binomial proportion is
% $$
% ts = \frac{\hat p - p_0}{\sqrt{p_0 (1 - p_0) / n}}.
% $$
% The score test has better finite sample performance than the Wald test.
\item The {\bf Score interval} is obtained by inverting a score test
\begin{eqnarray*}
& \hat p \left(\frac{n}{n + Z_{1 - \alpha / 2}^2}\right) +
\frac{1}{2} \left(\frac{Z_{1 - \alpha / 2}^2}{n + Z_{1 - \alpha / 2}^2}\right) \\
& \pm Z_{1 - \alpha/2}\sqrt{\frac{1}{n + Z_{1 - \alpha / 2}^2}
\left[\hat p (1 - \hat p) \left(\frac{n}{n + Z_{1 - \alpha / 2}^2}\right) +
\frac{1}{4} \left(\frac{Z_{1 - \alpha / 2}^2}{n + Z_{1 - \alpha / 2}^2}\right)
\right]}.
\end{eqnarray*}
\item An approximate score interval for $\alpha = .05$ can be obtained by taking
$\tilde p = \frac{X + 2}{n + 4}$ and $\tilde n = n + 4$, and calculating the Wald interval using $\tilde p$
instead of $\hat p$ and $\tilde n$ instead of $n$.
% \item An exact binomial test for $H_3$ can be performed by calculating the exact P-value
% $$
% P(X \geq x_{obs}| p = p_0) = \sum_{k = x_{obs}}^n \left(\begin{array}{c} n \\ k \end{array}\right)p_0^k (1 - p_0)^{n - k}.
% $$
% where $x_{obs}$ is the observed success count. For $H_1$ the
% corresponding exact P-value is
% $$
% P(X \leq x_{obs}| p = p_0) =
% \sum_{k = 0}^{ x_{obs}} \left(\begin{array}{c} n \\ k \end{array}\right)p_0^k (1 - p_0)^{n - k}.
% $$
% These tests are {\bf exact}, which means that the
% actual type one error rate is {\em no larger than} $\alpha$. (The
% actual type one error rate is generally smaller than $\alpha$.)
% Therefore these tests are {\bf conservative}. For $H_2$, calculate
% the appropriate one sided P-value and double it.
% \item Occasionally, someone will try to convince you to obtain an
% exact Type I error rate using supplemental randomization. Ignore
% them.
% \item Inverting the exact test, choosing those value of $p_0$ for
% which we fail to reject $H_0$, yields an exact confidence interval.
% This interval has to be calculated numerically. The coverage of the
% exact binomial interval is no lower than $100(1 - \alpha)\%$.
\end{enumerate}
\section{The likelihood for a binomial parameter $p$}
\begin{enumerate}[1.]
\item The {\bf likelihood} for a parameter is the density {\em viewed
as a function of the parameter}.
\item The binomial likelihood for observed data $x$ is
$p^x (1 - p)^{n - x}$. It is standard to drop multiplicative factors that do not
depend on the parameter from the likelihood (such as the $n$ choose $x$ part).
\item The {\bf principle of maximum likelihood} states that a good estimate of
the parameter is the one that makes the data that was actually observed most
probable. That is, the principle of maximum likelihood says that a good estimate
of the parameter is the one that maximizes the likelihood.
\begin{enumerate}[a.]
\item The maximum likelihood estimate for $p$ is $\hat p = X/n$.
\item The maximum likelihood estimate for $\mu$ for iid $\mathrm{N}(\mu, \sigma^2)$ data
is $\bar X$. The maximum likelihood estimate for $\sigma^2$ is $(n - 1) S^2 / n$ (the
biased sample variance).
\end{enumerate}
\item The {\bf law of the likelihood} states that {\bf likelihood
ratios} represent the relative evidence comparing one hypothesized
value of the parameter to another.
\item Likelihoods are usually plotted so that the maximum value (the
value at the ML estimate) is 1. The parameter values where reference lines at 1/8 and
1/32 intersect the likelihood depict {\bf likelihood intervals}.
Points lying within the 1/8 reference line, for example, are such
that no other parameter value is more than 8 times better supported
given the data.
\end{enumerate}
\section{Group comparisons}
\begin{enumerate}[1.]
\item For group comparisons, make sure to differentiate whether or not the
observations are paired (or matched) versus independent.
\item For paired comparisons for continuous data, one strategy is to
calculate the {\bf differences} and use the methods for testing
hypotheses and constructing confidence intervals regarding a single mean. The resulting tests
and confidence intervals are called {\bf paired Student's} $T$ tests
and intervals respectively.
\item For independent groups of iid variables, say $X_i$ and $Y_i$,
{\em with a constant variance} $\sigma^2$ across groups
$$
Z = \frac{\bar X - \bar Y - (\mu_x - \mu_y)}{S_p \sqrt{\frac{1}{n_x} +
\frac{1}{n_y}}}
$$
converges in distribution to a standard normal random variable as both $n_x$ and
$n_y$ get large. Here
$$S_p^2 = \frac{(n_x - 1) S_x^2 + (n_y - 1) S_y^2}{n_x + n_y - 2}$$
is the {\bf pooled estimate} of the variance. Obviously, $\bar X$,
$S_x$, $n_x$ are the sample mean, sample standard deviation and
sample size for the $X_i$ and $\bar Y$, $S_y$ and $n_y$ are defined
analogously.
\item If the $X_i$ and $Y_i$ happen to be normal, then $Z$ follows the
Student's $T$ distribution with $n_x + n_y - 2$ degrees of freedom.
\item Therefore a $(1 - \alpha)\times 100\%$ confidence interval for
$\mu_y - \mu_x$ is
$$
\bar Y - \bar X \pm t_{n_x + n_y - 2, 1 - \alpha/2}S_p\left(\frac{1}{n_x} + \frac{1}{n_y}\right)^{1/2}
$$
\item Exactly as before,
$$
\frac{\bar Y - \bar X}{S_p \left(\frac{1}{n_x} + \frac{1}{n_y}\right)^{1/2}}
$$
follows a non-central $T$ distribution with non-centrality parameter
$\frac{\mu_y - \mu_x}{\sigma \left(\frac{1}{n_x} +
\frac{1}{n_y}\right)^{1/2}}$. Therefore, we can use this statistic
to create a likelihood for $(\mu_y - \mu_x) / \sigma$, a standardized
measure of the change in group means.
\item Note that under unequal variances
$$
\bar Y - \bar X \sim N\left(\mu_y - \mu_x, \frac{\sigma_x^2}{n_x} + \frac{\sigma_y^2}{n_y}\right)
$$
\item The statistic
$$
\frac{\bar Y - \bar X - (\mu_y - \mu_x)}{\left(\frac{S_x^2}{n_x} + \frac{S_y^2}{n_y}\right)^{1/2}}
$$
approximately follows Gosset's $T$ distribution with degrees of freedom equal to
$$
\frac{\left(S_x^2 / n_x + S_y^2/n_y\right)^2}
{\left(\frac{S_x^2}{n_x}\right)^2 / (n_x - 1) +
\left(\frac{S_y^2}{n_y}\right)^2 / (n_y - 1)}
$$
% \item The test statistic
% $
% TS = \frac{\bar X - \bar Y}{S_p \sqrt{\frac{1}{n_x} +
% \frac{1}{n_y}}}
% $
% can be used to test the hypothesis that $H_0: \mu_x = \mu_y$ versus
% the alternatives $H_1: \mu_x < \mu_y$, $H_2: \mu_x \neq \mu_y$ and
% $H_3:\mu_x > \mu_y$. The test statistic should be compared to Student's
% $T$ quantiles with $n_x + n_y - 2$ df.
% \item $\frac{S_x^2/\sigma_x^2}{S_y^2/\sigma_y^2}$ follows what is
% called the $F$ distribution with $n_x - 1$ {\bf numerator degrees of
% freedom} and $n_y - 1$ denominator degrees of freedom.
% \item To test the hypothesis $H_0: \sigma_x^2 = \sigma_y^2$ versus th
% hypotheses $H_1 : \sigma_x^2 < \sigma_y^2$, $H_2 : \sigma_x^2 \neq \sigma_y^2$
% and $H_3 : \sigma_x^2 > \sigma_y^2$ compare the statistic $TS = S_1^2 / S_2^2$
% to the $F$ distribution. We reject $H_0$ if:
% \begin{enumerate}[$H_1$]
% \item if $TS < F_{n_x - 1, n_y - 1, \alpha}$,
% \item if $TS < F_{n_x - 1, n_y - 1, \alpha / 2}$ or $TS > F_{n_x - 1, n_y - 1, 1 - \alpha / 2}$,
% \item if $TS > F_{n_x - 1, n_y - 1, 1 - \alpha}$.
% \end{enumerate}
% \item The F distribution satisfies the property that $F_{n_x - 1, n_y - 1, \alpha} =
% F_{n_y - 1, n_x - 1, 1 - \alpha}$. So that, it turns out, that our results are consistent
% whether we put $S_x^2$ on the top or bottom.
% \item Using the fact that
% $$
% 1 - \alpha =
% P\left(F_{n_x - 1, n_y - 1, \alpha/2} \leq \frac{S_x^2/\sigma_x^2}{S_y^2/\sigma_y^2}
% \leq F_{n_x - 1, n_y - 1, 1 - \alpha / 2}\right)
% $$
% we can calculate a confidence interval for $\frac{\sigma_y^2}{\sigma_x^2}$ as
% $
% \left[F_{n_x - 1, n_y - 1, \alpha}\frac{S_x^2}{S_y^2},
% F_{n_x - 1, n_y - 1, 1-\alpha/2}\frac{S_x^2}{S_y^2}\right].
% $
% Of course, the confidence interval for $\frac{\sigma_x^2}{\sigma_y^2}$ is
% $
% \left[F_{n_y - 1, n_x - 1, \alpha}\frac{S_y^2}{S_x^2},
% F_{n_y - 1, n_x - 1, 1-\alpha/2}\frac{S_y^2}{S_x^2}\right].
% $
% \item F tests are not robust to the normality assumption.
% \item The statistic
% $$
% \frac{\bar X - \bar Y - (\mu_x - \mu_y)}{\sqrt{\frac{S_x^2}{n_x} + \frac{S_y^2}{n_y}}}
% $$
% follows a standard normal distribution for large $n_x$ and $n_y$.
% It follows an approximate Students $T$ distribution if the $X_i$ and
% $Y_i$ are normally distributed. The degrees of freedom are given below.
% \item For testing $H_0 : \mu_x = \mu_y$ in the event where there is
% evidence to suggest that $\sigma_x \neq \sigma_y$, the test statistic
% $
% TS = \frac{\bar X - \bar Y}{\sqrt{\frac{S_x^2}{n_x} + \frac{S_y^2}{n_y}}}
% $
% follows an approximate Student's $T$ distribution under the null hypothesis
% when $X_i$ and
% $Y_i$ are normally distributed. The degrees of freedom are approximated with
% $$
% \frac{(S_x^2 / n_x + S_y^2 / n_y)^2}{(S_x^2 / n_x)^2 / (n_x - 1) + (S_y^2 / n_y)^2 / (n_y - 1)}.
% $$
% \item The power for a $Z$ test of $H_0:\mu_x = \mu_y$ versus $H_3:\mu_x > \mu_y$ is
% given by
% $$
% P\left(Z \geq Z_{1 - \alpha} - \frac{\mu_x - \mu_y}{\sqrt{\frac{\sigma_x^2}{n_x}+\frac{\sigma_y^2}{n_y}}}\right)
% $$
% while for $H_1:\mu_x < \mu_y$ it is
% $$
% P\left(Z \leq -Z_{1 - \alpha} - \frac{\mu_x - \mu_y}{\sqrt{\frac{\sigma_x^2}{n_x}+\frac{\sigma_y^2}{n_y}}}\right).
% $$
% \item Sample size calculation assuming $n_x = n_y = n$
% $$
% n = \frac{(Z_{1 - \alpha} + Z_{1 - \beta}) ^ 2 (\sigma_x ^2 + \sigma_y ^ 2)}{(\mu_x - \mu_y)^2}.
% $$
\end{enumerate}
\end{document}