-
Notifications
You must be signed in to change notification settings - Fork 1
/
Preferential Attachment Analysis.Rmd
245 lines (185 loc) · 8.57 KB
/
Preferential Attachment Analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
----
title: "Preferential Attachment Analysis"
output: html_document
---
## Packages
```{r message=FALSE, warning=FALSE}
library(compiler)
library(dplyr)
library(tidygraph)
library(ggraph)
library(tidyverse)
library(sna)
library(igraph)
library(visNetwork)
library(networkD3)
library(zeallot)
```
## Import network data
The dataset has two dataframes that we can use to get the information about nodes and the information about edges. We rename the columns of the matchup dataframe to fit the edges definition and we have an id to the lineup dataframe to refer to lineup players by their node id.
The original dataset of matchups has lineups numbered from zero, but tidygraph (and most packages in R) have indices that start in 1. For that reason we add 1 to the entries of the edges table and also to lineup table.
```{r message=FALSE, warning=FALSE}
lineup <- read_csv("lineups.csv")
lineup$id <- lineup$id +1
names(lineup) <- c('id','label','team')
matchups <- read_csv("matchups.csv")
matchup <- matchups[,c("home_players","away_players","type","weight","home_team","away_team","time")]
names(matchup) <- c('from','to','weight','score',"home_team","away_team","time")
matchup$to <- matchup$to+1
matchup$from <- matchup$from+1
matchup <- matchup[!(matchup$weight==0),]
#consider time effect
matchup$score_adjusted <- matchup$score/matchup$time
matchup$weight <- factor(matchup$weight)
matchup$RowID <- seq.int(nrow(matchup))
print(paste("# Lineups: ",nrow(lineup)))
print(paste("# Matchups: ",nrow(matchups)))
print(paste("# Matchups after filtering: ",nrow(matchup),"(differences:",nrow(matchups)-nrow(matchup),")"))
```
## Preferential Attachment for Home/away team lineups
### Home team lineups analysis
Order the matchup table by the id of Home team lineup and treat the time steps as the times that a lineup showed up as a home team. For example, if lineup A showed up as Home team at the first time step, we treat this matchup as if it happened at t_0. Then the second time corresponds to t_1.
```{r message=FALSE, warning=FALSE}
# pre-analysis
home_matchup <- matchup[order(matchup$from),] %>%
group_by(from) %>%
mutate(time_step = 1:n()) %>% ungroup()
head(home_matchup)
# Histogram of counts
hist(count(home_matchup,from)[count(home_matchup,from)$n>1,]$n,xlim=c(0,50),freq = T,main="Histgram of counts",xlab="counts",breaks=150)
count(home_matchup,from) %>% ggplot(aes(x = from, y = n)) + geom_col()
```
Add positive outdegree at each time_step for each home lineups. Save results to an RData file.
```{r message=FALSE, warning=FALSE}
home_counts <- count(home_matchup,from)
home_matchup_degree <- home_matchup[0,] %>% add_column(deg_pout = NA)
for (i in c(1:nrow(home_counts))) {
id <- home_counts$from[[i]]
temp <- home_matchup[home_matchup$from == id, ] %>% add_column(deg_pout = NA)
temp$weight <- as.numeric(temp$weight)
temp$weight[temp$weight == 1] <- 0
temp$weight[temp$weight == 2] <- 1
for (j in c(1:nrow(temp))){
temp[j,"deg_pout"] <- sum(temp$weight[1:j])
}
home_matchup_degree <- rbind(home_matchup_degree,temp)
}
save(home_matchup_degree, file = "RData/home_matchup_degree.RData")
```
Calculate the probability of a home lineup win at the current time step if they have positive outdegree = past_deg.
```{r message=FALSE, warning=FALSE}
load("RData/home_matchup_degree.RData")
attraction_home <- function(past, now, past_deg,now_deg){
data <- home_matchup_degree %>%
inner_join(home_matchup_degree %>% filter(time_step==past) %>%select(from)) %>%
inner_join(home_matchup_degree %>% filter(time_step==now) %>% select(from)) %>%
filter(time_step == past | time_step == now)
data_denominator <- data %>% filter(deg_pout ==past_deg & time_step==past)
data_numerator <- data %>% filter(deg_pout== now_deg & time_step==now) %>%
inner_join(data_denominator %>% select(from))
return(list(nrow(data_numerator),nrow(data_denominator)))
}
```
Combine the results in a dataframe.
Takes ~ 2 hours.
```{r message=FALSE, warning=FALSE}
home_counts <- count(home_matchup_degree,from) # count the number of time steps for each node
home_win_prob <- data.frame(past_deg=as.numeric(), now_deg = as.numeric(), win_prob = as.numeric(), n1 =as.numeric(), n2 = as.numeric(), stringsAsFactors=FALSE)
for (i in c(1:(max(home_matchup_degree$deg_pout)-1))){
if (i %% 10 == 0){
print(i)
}
home_win_prob[i, "past_deg"] <- i
home_win_prob[i, "now_deg"] <- i+1
n1 <- 0
n2 <- 0
deg1 <- home_win_prob$past_deg[[i]]
deg2 <- home_win_prob$now_deg[[i]]
for (j in c(2:max(home_counts$n))){
c(a,b) %<-% attraction_home(past = j-1, now = j, past_deg = i,now_deg = i+1)
n1 <- n1 + a
n2 <- n2 + b
}
home_win_prob[i,"win_prob"] <- n1/n2
home_win_prob[i,"n1"] <- n1
home_win_prob[i,"n2"] <- n2
}
save(home_win_prob, file = "RData/home_win_prob.RData")
```
```{r message=FALSE, warning=FALSE}
load("RData/home_win_prob.RData")
reg_home <- lm(win_prob ~ past_deg, data = home_win_prob[home_win_prob$n2>70,])
summary(reg_home)
plot(x=home_win_prob$past_deg,y=home_win_prob$win_prob,main = "Probability of winning with home lineups",xlab = "Past wins", ylab = "Odds of winning")+abline(v=home_win_prob$past_deg[home_win_prob$n2<70][1],col="blue")+abline(reg_home,col="red",lwd=3, lty=2)
```
### Away team lineups analysis
Order the matchup table by the id of Away team lineup and treat the time steps as the times that a lineup showed up as an away team. For example, if lineup A showed up as away team at the first time step, we treat this matchup as if it happened at t_0. Then the second time corresponds to t_1.
```{r message=FALSE, warning=FALSE}
# pre-analysis
away_matchup <- matchup[order(matchup$to),] %>%
group_by(to) %>%
mutate(time_step = 1:n()) %>% ungroup()
head(away_matchup)
count(away_matchup,to) %>% ggplot(aes(x = to, y = n)) + geom_col()
```
Add the positive indegree at each time_step for each away lineup. Save results to an RData file.
```{r message=FALSE, warning=FALSE}
away_counts <- count(away_matchup,to)
away_matchup_degree <- away_matchup[0,] %>% add_column(deg_pin = NA)
for (i in c(1:nrow(away_counts))) {
id <- away_counts$to[[i]]
temp <- away_matchup[away_matchup$to == id, ] %>% add_column(deg_pin = NA)
temp$weight <- as.numeric(temp$weight)
temp$weight[temp$weight == 1] <- 1
temp$weight[temp$weight == 2] <- 0
for (j in c(1:nrow(temp))){
temp[j,"deg_pin"] <- sum(temp$weight[1:j])
}
away_matchup_degree <- rbind(away_matchup_degree,temp)
}
save(away_matchup_degree, file = "RData/away_matchup_degree.RData")
```
Calculate the probability of an away lineup win at current time step if they have a positive indegree = past_deg.
```{r message=FALSE, warning=FALSE}
load("RData/away_matchup_degree.RData")
attraction_away <- function(past, now, past_deg,now_deg){
data <- away_matchup_degree %>%
inner_join(away_matchup_degree %>% filter(time_step==past) %>%select(to)) %>%
inner_join(away_matchup_degree %>% filter(time_step==now) %>% select(to)) %>%
filter(time_step == past | time_step == now)
data_denominator <- data %>% filter(deg_pin ==past_deg & time_step==past)
data_numerator <- data %>% filter(deg_pin== now_deg & time_step==now) %>%
inner_join(data_denominator %>% select(to))
return(list(nrow(data_numerator),nrow(data_denominator)))
}
```
Combine the results in a dataframe.
Takes ~ 2 hours.
```{r message=FALSE, warning=FALSE}
away_counts <- count(away_matchup_degree,to) # count number of time steps for each node
away_win_prob <- data.frame(past_deg=as.numeric(), now_deg = as.numeric(), win_prob = as.numeric(),n1 =as.numeric(), n2 = as.numeric(), stringsAsFactors=FALSE)
for (i in c(1:(max(away_matchup_degree$deg_pin)-1))){
away_win_prob[i, "past_deg"] <- i
away_win_prob[i, "now_deg"] <- i+1
n1 <- 0
n2 <- 0
deg1 <- away_win_prob$past_deg[[i]]
deg2 <- away_win_prob$now_deg[[i]]
for (j in c(2:max(away_counts$n))){
c(a,b) %<-% attraction_away(past = j-1, now = j, past_deg = i,now_deg = i+1)
n1 <- n1 + a
n2 <- n2 + b
}
away_win_prob[i,"win_prob"] <- n1/n2
away_win_prob[i,"n1"] <- n1
away_win_prob[i,"n2"] <- n2
}
save(away_win_prob, file = "RData/away_win_prob.RData")
```
Regression analysis
```{r message=FALSE, warning=FALSE}
load("RData/away_win_prob.RData")
reg_away <- lm(win_prob ~ past_deg, data = away_win_prob[away_win_prob$n2>70,])
summary(reg_away)
plot(x=away_win_prob$past_deg,y=away_win_prob$win_prob,main = "Probability of winning with away lineups",xlab = "Past wins", ylab = "Odds of winning")+abline(v=away_win_prob$past_deg[away_win_prob$n2<70][1],col="blue")+abline(reg_away,col="red",lwd=3, lty=2)
```