-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathMultipleVariables.R
127 lines (102 loc) · 5.71 KB
/
MultipleVariables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
## ----global_options, include = FALSE-------------------------------------------------------------------------------------------------------------------
# Best-effort load of the .Rprofile located one directory up; wrapping the
# call in try() lets the rest of the script run even if sourcing fails.
try(source("../.Rprofile"))
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# For Data Manipulations
library(tidyverse)
# For Additional table output
# install.packages("knitr")
library(knitr)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Read the class survey CSV with readr's read_csv; the result is kept in
# df_survey and used by every later step
df_survey <- read_csv(file = "data/classsurvey.csv")
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Several columns are categorical and should be stored as factors.
# A single column can be converted by hand ...
df_survey[["gender"]] <- as.factor(df_survey[["gender"]])
# ... but repeating that for each column is cumbersome, so as.factor is applied
# to all of the categorical columns at once with lapply, a core R function
factor_col_names <- c("gender", "major", "commute", "games.any", "econ")
df_survey[factor_col_names] <- lapply(
  X = df_survey[factor_col_names],
  FUN = as.factor
)
# Inspect the resulting variable types
str(df_survey)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Scatter plot for a pair of continuous variables
# Set the plot size options
options(repr.plot.height = 4, repr.plot.width = 4)
# Build the plot:
#   - games attended on the x axis, years in Houston on the y axis
#   - each point is labelled with the individual's ID, prefixed by the letter I
#   - a trend line fitted with lm is drawn on top
scatter <- ggplot(
  data = df_survey,
  mapping = aes(x = games.attended, y = years.in.houston)
) +
  geom_point(size = 1) +
  geom_text(
    mapping = aes(label = paste0("I", ID)),
    size = 3, hjust = -0.2, vjust = -0.2
  ) +
  geom_smooth(method = lm) +
  labs(
    title = paste0(
      "Scatter Plot of Two Continuous/Quantitative Variables",
      "\nIn Class Survey of 10 Students"
    ),
    x = "Games Attended at the University",
    y = "Years Spent in the City of Houston",
    caption = "In Class Survey"
  ) +
  theme_bw()
print(scatter)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Cross-tabulate frequencies for two categorical variables.
# count() returns one ungrouped row per (gender, econ) combination with the
# number of observations in `freq`; pivot_wider() then moves the gender values
# into columns, leaving one row per econ value.
# pivot_wider() is the current replacement for the superseded spread(), and
# count() avoids the leftover grouping that group_by() + summarize() leaves
# behind. Combinations that never occur show up as NA.
df_survey %>%
  count(gender, econ, name = "freq") %>%
  pivot_wider(names_from = gender, values_from = freq)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Fraction of individuals falling in each gender-by-econ group:
# group on the combined factor, count the rows in every group, then divide
# each count by the overall total
df_survey %>%
  group_by(interaction(gender, econ)) %>%
  summarise(freq = n()) %>%
  mutate(fraction = freq / sum(freq))
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# The same two-way counts drawn as a stacked bar chart
# Set the plot size options
options(repr.plot.height = 2, repr.plot.width = 3)
# One bar per gender, filled by econ
stacked.bar.plot <- ggplot(data = df_survey) +
  geom_bar(mapping = aes(x = gender, fill = econ)) +
  theme_bw()
print(stacked.bar.plot)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Step one: compute the mean number of games attended within each gender
df_gender_avg_games <- df_survey %>%
  group_by(gender) %>%
  summarise(
    avg.games.attended = mean(games.attended)
  )
df_gender_avg_games
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Bar chart built from the pre-computed means in df_gender_avg_games
# Set the plot size options
options(repr.plot.height = 2, repr.plot.width = 2)
# stat = "identity" plots the value stored in avg.games.attended for each
# gender as the bar height, rather than counting rows
group.means <- ggplot(data = df_gender_avg_games) +
  geom_bar(
    mapping = aes(x = gender, y = avg.games.attended),
    stat = "identity"
  ) +
  theme_bw()
print(group.means)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Computing the means first and plotting them second takes two steps; the
# same chart can be drawn in one step directly from df_survey
# Set the plot size options
options(repr.plot.width = 2, repr.plot.height = 2)
# stat = "summary" collapses games.attended to a single value per gender, and
# fun = "mean" selects the mean as that value. `fun` replaces the `fun.y`
# argument, which was deprecated in ggplot2 3.3.0 and is otherwise reported as
# an unknown parameter.
# The result looks the same as the two-step version
group.means.joint <- ggplot(df_survey) +
  geom_bar(
    aes(x = gender, y = games.attended),
    stat = "summary", fun = "mean"
  ) +
  theme_bw()
print(group.means.joint)
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Summary statistics as a table, one row per gender-by-econ group: the mean
# number of games attended and the number of observations in the group
df_survey %>%
  group_by(gender, econ) %>%
  summarise(
    avg.games.attended = mean(games.attended),
    N.count = n()
  )
## ------------------------------------------------------------------------------------------------------------------------------------------------------
# Show the gender-by-econ group means visually
# Set the plot size options
options(repr.plot.width = 4, repr.plot.height = 2)
# Plot directly from df_survey:
#   - fill = econ gives the econ and non-econ groups different fill colours
#   - stat = "summary" with fun = "mean" still plots the average of
#     games.attended (`fun` replaces `fun.y`, deprecated in ggplot2 3.3.0)
#   - position = "dodge" places the econ and non-econ bars next to each other;
#     by default, different fill groups are stacked on top of each other
two.group.means <- ggplot(df_survey) +
  geom_bar(
    aes(x = gender, y = games.attended, fill = econ),
    stat = "summary", fun = "mean", position = "dodge"
  ) +
  theme_bw()
print(two.group.means)