forked from msr-ds3/coursework
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampling_means_HW.Rmd
109 lines (79 loc) · 3.09 KB
/
sampling_means_HW.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
---
title: "Coefficients and bias"
author: "Jacob LaRiviere"
date: "June 18, 2017"
output: html_document
---
```{r setup, include=FALSE}
# Make `echo = TRUE` the default for every chunk, so each chunk's R source is
# shown in the knitted document alongside its output.
knitr::opts_chunk$set(echo = TRUE)
```
## R Markdown
This is an R Markdown document. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
```{r}
library(dplyr)
library(plyr)
library(tidyr)
library(ggplot2)
library(reshape2)
#setwd("C:/Users/jlariv/OneDrive/Econ 404/")
#mydata <- read.csv("oj.csv")
#colnames(mydata)[1-17]
# Simulate the "population" used by the rest of this chunk.
#
# Fix the RNG seed so the simulated population, and every sample drawn from it
# further down, is identical each time the document is knit.
set.seed(404)

# n is the number of observations (rows) in the simulated population.
n <- 1000
# p is the number of variables (columns) drawn for each observation.
p <- 2
# rnorm() draws from a standard normal distribution; fill an n-by-p matrix.
X <- matrix(rnorm(n * p), n, p)
# Look at the first few rows of the data.
head(X)
# Histogram of the first column. It should look like a bell curve.
hist(X[, 1])
# NOTE: to draw from a normal distribution with a non-zero mean and a
# non-unit standard deviation, rescale the standard normal draws:
#   rnorm(n * p) * st_dev + mu
st_dev <- 2
mu <- 50
X[, 1] <- X[, 1] * st_dev + mu
# data.frame() names the columns of an unnamed matrix X1, X2, ...
X <- data.frame(X)
# Map the column by name inside aes(); indexing the global object
# (aes(X[, 1])) bypasses the `data` argument and mislabels the axis.
ggplot(X, aes(x = X1)) + geom_histogram(binwidth = 1)
# Base R version: hist(X[, 1])
# Jacob made this data up, so we know the population mean is 50. R's sample()
# function takes a random subsample of the data; here each subsample holds
# 25 observations.
obs <- 25
total_samps <- 100
# Draw total_samps subsamples of size obs (sample() draws without replacement
# by default) and record the mean of each one. seq_len() yields an empty
# sequence when total_samps is 0, unlike 1:total_samps, which would count down
# from 1 to 0.
sample_means <- vapply(
  seq_len(total_samps),
  function(i) mean(sample(X[, 1], obs)),
  numeric(1)
)
# Wrap the means in a one-column data frame (column name: sample_means) so
# ggplot() can plot them.
sample_means <- data.frame(sample_means)
ggplot(sample_means, aes(x = sample_means)) + geom_histogram(binwidth = 0.1)
# Theoretical standard error of the mean for samples of size obs.
st_err <- st_dev / obs^0.5
# Base R version: hist(sample_means$sample_means)
summary(sample_means)
# Sample sizes to study: every integer from obs_0 up to obs_max.
obs_0 <- 3
obs_max <- 100
obs_vec <- seq(obs_0, obs_max, 1)
obs_count <- length(obs_vec)

# One row per sample size. `mean` is filled in by the simulation below;
# `st_error` is the theoretical standard error st_dev / sqrt(sample size),
# which does not depend on the simulated draws and so is computed up front.
sim_power <- data.frame(
  Number_Obs = obs_vec,
  mean = 0,
  st_error = st_dev / obs_vec^0.5
)

# For each sample size, draw total_samps subsamples of that size from the
# first column of X and store the average of their means.
for (j in seq_len(obs_count)) {
  sample_means <- vapply(
    seq_len(total_samps),
    function(i) mean(sample(X[, 1], obs_vec[j])),
    numeric(1)
  )
  sim_power$mean[j] <- mean(sample_means)
}

# Average of the simulated sample means against sample size.
ggplot(sim_power, aes(x = Number_Obs)) +
  geom_line(aes(y = mean), colour = "blue")
# Theoretical standard error against sample size.
ggplot(sim_power, aes(x = Number_Obs)) +
  geom_line(aes(y = st_error), colour = "red")

# Question: Take a normal distribution with mu = 10 and variance of 9 (i.e.,
# standard deviation of 3). Simulate a population of 10,000. What is the sample
# size needed to make the standard error of the sample mean sufficiently small
# so that no more than 5% of the sample means are less than 9?
```
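Note that the code chunk above uses the default `echo = TRUE` set in the setup chunk, so the R code that generated the plots is printed along with its output. Add `echo = FALSE` to a chunk's header to prevent printing of the R code that generated the plot.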