-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathTheOffice.R
163 lines (142 loc) · 5.25 KB
/
TheOffice.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Setup----
library(schrute)
library(tidyverse)
library(tidytext)
library(viridis)
library(ggrepel)
# IMDB rating data
office_ratings <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv')
office_ratings$season <- sprintf("%02d",office_ratings$season)
office_ratings$episode <- sprintf("%02d",office_ratings$episode)
# Transcript data
mydata <- schrute::theoffice
# Remove "" from the character column
mydata$character <- gsub("\"","",mydata$character)
# Tokenize the data (individual words)
token.mydata <- mydata %>%
unnest_tokens(word, text)
# Remove frequent words (the, it, etc.)
stop_words <- stop_words
tidy.token.mydata <- token.mydata %>%
anti_join(stop_words, by = "word")
# Word frequency----
# Calculate word frequency (above 400)
word.freq <- tidy.token.mydata %>%
count(word, sort = TRUE) %>%
filter(n>400)
# Plot most frequent words
#png("wordfreq.png",units="in",width=6,height=6,res=300)
#print (
ggplot(word.freq, aes(x=reorder(word,n),y=n)) +
geom_bar(stat="identity") + coord_flip() +
labs(x = "Word", y ="Frequency of Word" ,title ="Most Frequently Used Words")
#)
#dev.off()
# Episode ratings----
# Plot IMDB ratings over episodes
#png("rating.png",units="in",width=5,height=7,res=300)
#print (
ggplot(office_ratings, aes(season, episode, fill = imdb_rating)) +
geom_tile(colour = "white") +
scale_fill_viridis(option="magma") +
labs(x = "Season", y ="Episode" ,title ="IMDB Ratings of The Office",fill="Rating")
#)
#dev.off()
# Word sentiment----
# Calculate sentiment of words based on the Bing lexicon
office.sentiment <- tidy.token.mydata %>%
inner_join(get_sentiments("bing")) %>%
count(season, index = episode, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
colnames(office.sentiment)[2] <- "episode"
# Combine sentiment and rating data
ratings.sentiment <- merge(office.sentiment, office_ratings, by = c("season","episode"))
# Plot sentiment across seasons
#png("sentiment.png",units="in",width=7,height=5,res=300)
#print (
ggplot(office.sentiment, aes(sentiment)) +
geom_density(aes(color=season)) +
scale_color_viridis(option="magma",discrete = TRUE) +
labs(x = "Mean sentiment rating across season", y ="Density", title ="Sentiment Ratings Across Seasons", color="Season")
#)
#dev.off()
# Plot sentiment vs. IMDB rating
#png("ratingsentiment.png",units="in",width=7,height=5,res=300)
#print (
ggplot(ratings.sentiment, aes(x=sentiment,y=imdb_rating,group=season,color=season)) +
geom_jitter() +
scale_color_viridis(option="magma",discrete = TRUE) +
labs(x = "Sentiment rating", y ="IMDB Rating" ,title ="Sentiment vs. IMDB Rating",color="Season") +
geom_label_repel(aes(label = title),
data = subset(ratings.sentiment, (sentiment <= -50 | sentiment >= 35)),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')
#)
#dev.off()
# Characters line frequency----
# Find Top 20 characters by number of lines
characters.top20 <- tidy.token.mydata %>%
count(character, sort = TRUE) %>%
top_n(20) %>%
pull(character)
# Calculate number of lines per character
characters <- mydata %>%
group_by(season, episode) %>%
ungroup() %>%
group_by(character) %>%
count(sort=TRUE)
# Plot most-speaking characters
#png("charfreq.png",units="in",width=6,height=6,res=300)
#print (
ggplot(subset(characters, character %in% characters.top20),aes(x=reorder(character,n),y=n)) +
geom_bar(stat="identity") +
coord_flip() +
labs(x = "Character", y ="Number of Lines" ,title ="Number of Lines by Character")
#)
#dev.off()
# Character sentiment----
# Calculate average sentiment for each character
character.sentiment <- tidy.token.mydata %>%
inner_join(get_sentiments("bing")) %>%
group_by(character) %>%
filter(is.element(character,characters.top20)) %>%
count(character, index = season, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
# Plot sentiment for each character (average per season)
#png("charsentiment.png",units="in",width=6,height=6,res=300)
#print (
ggplot(character.sentiment, aes(x=reorder(character,sentiment),y=sentiment)) +
geom_boxplot() +
coord_flip() +
labs(x = "Character", y ="Sentiment Rating" ,title ="Character Sentiments")
#)
#dev.off()
# Angela's love life
# Calculate number of times the State Senator is mentioned
angelas.senator <- tidy.token.mydata %>%
filter(str_detect(word, "senator")) %>%
count(character)
# Plot who talks about the State Senator
#png("senator.png",units="in",width=6,height=6,res=300)
#print (
ggplot(angelas.senator,aes(x=reorder(character,n),y=n)) +
geom_bar(stat="identity") +
coord_flip() +
labs(x = "Character", y ="Number of Senator Mentions" ,title ="Who talks about the state senator?")
#)
#dev.off()
# Calculate who Angela talks about: the State Senator or Dwight
angela.love <- tidy.token.mydata %>%
subset(character=="Angela") %>%
filter(str_detect(word, "dwight$|senator$"))
# Plot who Angela talks about
#png("angelalove.png",units="in",width=6,height=6,res=300)
#print (
ggplot(angela.love, aes(word)) +
geom_bar() +
labs(x = "Angela's Lover", y ="Number of Mentions" ,title ="Who does Angela talk about?")
#)
#dev.off()