-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing-1.R
221 lines (197 loc) · 7.23 KB
/
preprocessing-1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
library(ape)
library(dplyr)
library(ggpattern)
library(ggplot2)
library(reshape2)
library(stringr)
library(lubridate)
library(xml2)
## To keep hard-coding to a minimum I am using a config.xml to store
## file names. Define all the files that are either used or created by
## this script see the configuration XML for these values and a short
## description of what the files contain.
config <- as_list(read_xml("config.xml"))
time_series_input_csv <- config$files$data$timeSeries[[1]]
time_series_clean_csv <- config$files$results$intermediate$timeSeries[[1]]
li_nexus <- config$files$data$sequences[[1]]
data_plot_png <- config$files$results$figures$dataPlot[[1]]
data_plot_rds <- str_replace(data_plot_png, ".png", ".rds")
session_rdata_out <-
sprintf("out/preprocessing-1-workspace-%s.RData", Sys.Date())
stopifnot(file.exists(time_series_clean_csv))
stopifnot(file.exists(li_nexus))
## Read in the data from WPD and munged into a nice data.frame. By
## default WPD labels the columns of a bar-chart "BAR0", "BAR1", ...
## and the weeks are labelled 1, 2, ... so these are parsed and used
## to create a column of the first day of each week. Based on the
## timing of the vaccination rounds, it looks like week 18 covers the
## dates 4--8 May, so we have shifted the epidemiological weeks to
## match this.
##
## To account for the non-integer values exported by WPD we just round
## to the nearest integer. The total case count matches the reported
## number of cases so it seems like this has been accurate.
epi_week_start_date <- ymd("2010-01-01") - days(3)
who_df <-
read.csv(time_series_input_csv, header = FALSE) |>
rename(week_zrd = V1, cases = V2) |>
mutate(week_zrd = as.integer(gsub(x = week_zrd,
pattern = "Bar",
replacement = "")) + 1,
week_date = epi_week_start_date + week_zrd * days(7),
cases = round(cases))
## 4 May is a Tuesday, \code{weekdays(ymd("2010-05-04")) == "Tuesday"}
## so we can use this to add columns for the Sunday and Saturday of
## each week to the data frame above. The weekend columns are useful
## for plotting the sequence dates later.
who_df$week_start_date <- who_df$week_date - days(2)
who_df$week_end_date <- who_df$week_date + days(4)
## It is useful to have the dates of the vaccination rounds in a data
## frame so we can plot them later.
interval_df <-
data.frame(
start_date = ymd(c("2010-05-04",
"2010-05-18",
"2010-06-01",
"2010-06-15",
"2010-10-04",
"2010-11-08",
"2010-09-13")),
end_date = ymd(c("2010-05-08",
"2010-05-22",
"2010-06-05",
"2010-06-19",
"2010-10-08",
"2010-11-12",
"2010-09-17")),
label = c("Round 1",
"Round 2",
"Round 3",
"Round 4",
"Round 5",
"Round 6",
"Mop-up"),
y = c(80, 80, 80, 80, 80, 80, 80)
)
## And if we want to include the sequenced samples as a bar with
## different cross hatching, we will need to extract the weekly
## sequence counts from the data as well.
seqs <- ape::read.nexus.data(file = li_nexus)
seq_dates <-
seqs |>
names() |>
str_extract("[0-9]{4}-[0-9]{2}-[0-9]{2}") |>
purrr::discard(.p = is.na) |>
ymd()
seq_df <-
who_df |>
select(week_date, week_start_date, week_end_date) |>
mutate(seq_count = 0)
for (ix in seq_len(nrow(seq_df))) {
week_start <- seq_df[ix, "week_start_date"]
week_end <- seq_df[ix, "week_end_date"]
seq_df[ix, "seq_count"] <-
sum(seq_dates >= week_start & seq_dates <= week_end)
}
## It appears that the sequences don't match up nicely with the number
## of cases. Quite possibly there is a difference between the date
## associated with the case and the date associated with the sequence.
## To avoid double counting cases, I have used the cases minus the
## number of sequences associated with that week, and for the two
## weeks in which this is negative (a value of -1) I have set the
## value to 0. This should only make a minor change to the data and
## should have a far smaller effect than over counting the cases.
##
## As part of testing the sensitivity to model assumptions, we re-ran
## this without subtracting the sequences from time series. The
## commented mutate command in this pipeline will do that and the only
## thing that needs to be changed is the resulting disaster sizes in
## the XML.
plt_df <-
who_df |>
inner_join(seq_df, by = c("week_date")) |>
mutate(cases_minus_seqs = pmax(0, cases - seq_count),
week_start_date = week_start_date.x,
week_end_date = week_end_date.x) |>
##
## NOTE If you want to run the sensitivity analysis you need to swap
## these mutate commands between the commented and uncommented
## versions!!!
##
## mutate(cases_minus_seqs = cases,
## week_start_date = week_start_date.x,
## week_end_date = week_end_date.x) |>
select(week_date,
week_start_date,
week_end_date,
cases_minus_seqs,
seq_count) |>
melt(id.vars = c("week_date",
"week_start_date",
"week_end_date"),
variable.name = "data_type",
value.name = "count") |>
mutate(data_type = factor(data_type,
levels = c("cases_minus_seqs", "seq_count"),
labels = c("Cases", "Sequences")))
## We will need the case data for subsequent analysis so we will save
## while we already have it in a useful format.
write.table(x = plt_df,
file = time_series_clean_csv,
sep = ",",
row.names = FALSE)
## It is useful to have a plot which displays this data along with the
## various times at which interventions where enacted.
outbreak_gg <-
ggplot() +
geom_col_pattern(
data = plt_df,
mapping = aes(x = week_date,
y = count,
pattern = data_type),
fill = "white",
pattern_spacing = 0.015, pattern_angle = 45
) +
scale_pattern_manual(
values = c("stripe", "crosshatch")
) +
geom_point(
data = interval_df,
mapping = aes(x = start_date, y = y),
shape = 6
) +
geom_text(
data = interval_df,
mapping = aes(x = start_date, y = y, label = label),
hjust = 0,
vjust = -1,
angle = 30,
size = 3
) +
scale_x_date(
date_breaks = "2 month",
date_labels = "%b %d",
limits = c(ymd("2010-01-01"), ymd("2010-08-01")),
expand = c(0, 0)
) +
scale_y_continuous(
limits = c(0, 90),
breaks = seq(0, 100, 20),
expand = c(0, 2)
) +
labs(x = NULL, y = NULL, pattern = NULL, pattern_angle = NULL) +
theme_bw() +
theme(
legend.position = c(0.2, 0.3)
)
## save a copy of the data plot object so we can re-use it later.
saveRDS(object = outbreak_gg,
file = data_plot_rds)
ggsave(filename = data_plot_png,
plot = outbreak_gg,
height = 0.7 * 14.8, width = 21.0,
units = "cm")
## It is useful to save a copy of the whole workspace so that if we
## need to come back to edit the plot we can do so without having to
## re-run the whole analysis.
save.image(file = session_rdata_out)