-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot-tributR_FSA.r
617 lines (515 loc) · 38.8 KB
/
plot-tributR_FSA.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
# This code produces an automated MS Word report based on attribution analyses of C. jejuni and C. coli. It requires as input the inferred ancestries files generated by the bigsdb_attributor package (one each for C. jejuni and C. coli)
# Developed using funding from the Food Standards Agency (FS101013) and the NIHR Health Protection Research Unit in Gastrointestinal Infections
# Written by Melissa Jansen van Rensburg (2018)
#TODO: In py script head id column in ancs output; add date column (filling as per usual)
#TODO: fill site automatically
#TODO: fix human fill problem in attributor
library(ReporteRs)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
library(RColorBrewer)
library(zoo)
library(data.table)
library(scales)
##### SET-UP #####
### Provide paths to input files ###
# C. jejuni
cj_input_data = "/Users/MelissaJvR/Desktop/FSA_Final/RInputFiles/FSA_Cj_inferred-ancestry.csv"
# C. coli
cc_input_data = "/Users/MelissaJvR/Desktop/FSA_Final/RInputFiles/FSA_Cc_inferred-ancestry.csv"
# Define colour palette
set_fill_colours = scale_fill_manual(values = c("Cattle" = "#6FBAFF", "Chicken" = "#FFF697", "Environment" = "#A4FF53", "Sheep" = "#B3B3B3", "Turkey" = "#000000", "Duck" = "#FF9E36", "Goose" = "#FF6666", "Pig" = "#ffc0e9", "Wild.bird" = "#803C20", "Other animal" = "#800080", "Dog" = "#008080", "Ruminant" = "#6FBAFF"))
# Alternative qualitative palette
# If this is used, you must ensure that colour mapping is consistent across all graphs (some editing of the script will be required)
# set_fill_colours = scale_fill_brewer(type="qual", palette="Set3", direction=1)
##### FILE PROCESSING #####
# Open C. jejuni file, convert missing data to NA and drop rows without dates
# Note that PubMLST dates are currently in ISO 8601 format (for xx/xx/xxxx use format=format='%d-%m-%Y')
ancs_cj = read.csv(cj_input_data, header=TRUE, check.names=TRUE)
ancs_cj[ancs_cj==""] <- NA
ancs_cj$date <- as.Date(ancs_cj$date)
cj = ancs_cj[!is.na(ancs_cj$date),]
# Repeat for C. coli
ancs_cc = read.csv(cc_input_data, header=TRUE, check.names=TRUE)
ancs_cc[ancs_cc==""] <- NA
ancs_cc$date <- as.Date(ancs_cc$date)
cc = ancs_cc[!is.na(ancs_cc$date),]
##### DIRECTORY SET-UP #####
# Get current directory and name output directory
# NOTE: the script will fail here if the output directory already exists. Remove/rename the existing directory and try again. If you've deleted the file, make sure that your current working directory isn't set to Trash! To do this, enter "getwd()" and then run the command "setwd(PATH/TO/DESIRED/DIRECTORY)" to move to the right directory.
current_dir = sprintf("%s",getwd())
output_dir = "AttributionReportFSA"
# Create output diretory
dir.create(file.path(current_dir, output_dir))
# Set output directory as current directory
setwd(file.path(current_dir, output_dir))
##### DATA PROCESSING #####
### Isolate counts for original and cleaned datasets ###
# C. jejuni
no_ancs_cj = nrow(ancs_cj)
no_cj = nrow(cj)
cj_missing_dates = no_ancs_cj - no_cj
cj_missing_dates_prop = round((cj_missing_dates/no_ancs_cj*100), digits=2)
all_cj_oxc = sum(ancs_cj$site == 'Oxfordshire')
all_cj_nwc = sum(ancs_cj$site == 'Newcastle')
dated_cj_oxc = sum(cj$site == 'Oxfordshire')
dated_cj_nwc = sum(cj$site == 'Newcastle')
cj_oxc_missing_dates = all_cj_oxc - dated_cj_oxc
cj_nwc_missing_dates = all_cj_nwc - dated_cj_nwc
# C. coli
no_ancs_cc = nrow(ancs_cc)
no_cc = nrow(cc)
cc_missing_dates = no_ancs_cc - no_cc
cc_missing_dates_prop = round((cc_missing_dates/no_ancs_cc*100), digits=2)
all_cc_oxc = sum(ancs_cc$site == 'Oxfordshire')
all_cc_nwc = sum(ancs_cc$site == 'Newcastle')
dated_cc_oxc = sum(cc$site == 'Oxfordshire')
dated_cc_nwc = sum(cc$site == 'Newcastle')
cc_oxc_missing_dates = all_cc_oxc - dated_cc_oxc
cc_nwc_missing_dates = all_cc_nwc - dated_cc_nwc
### Max and min dates ###
# C. jejuni
max_date_cj = format(max(as.Date(cj$date, format="%d/%m/%Y")), "%B %Y")
min_date_cj = format(min(as.Date(cj$date, format="%d/%m/%Y")), "%B %Y")
# C. coli
max_date_cc = format(max(as.Date(cc$date, format="%d/%m/%Y")), "%B %Y")
min_date_cc = format(min(as.Date(cc$date, format="%d/%m/%Y")), "%B %Y")
# Overall
max_date = format(max(as.Date(dplyr::bind_rows(cj, cc)$date, format="%d/%m/%Y")), "%B %Y")
min_date = format(min(as.Date(dplyr::bind_rows(cj, cc)$date, format="%d/%m/%Y")), "%B %Y")
##### OVERALL SUMMARIES #####
### Summary plots and tables for C. jejuni ###
## Site-by-site summaries ##
# Get only source columns for overall summaries
cj_sources <- ancs_cj %>%
select(-matches('id')) %>%
select(-matches('date')) %>%
select(-matches('species'))
# Get proportions for each source, site by site and convert to long form
cj_anc_sites = (aggregate(.~site, cj_sources, mean))
cj_anc_sites_long = melt(cj_anc_sites, id="site", variable.name="Source", value.name="Proportion")
# Create site-by-site table
cj_sbs_summary = setNames(data.frame(t(cj_anc_sites[,-1])), cj_anc_sites[,1])
# Generate grouped bar graph showing site-by-site proportions
cj_sbs_overall_plot = ggplot(cj_anc_sites_long, aes(x = reorder(Source, -Proportion), y = Proportion)) + geom_bar(aes(fill = site), stat="identity", position = position_dodge(width=0.8), width = 0.8) + scale_fill_grey() + theme_grey(base_size = 14) + labs(x="C. jejuni", y="Proportion") + guides(fill=guide_legend(title=NULL)) + theme(legend.position="bottom", legend.direction = "horizontal", axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))
## Overall summary ##
# Drop sites to get overall proportions
cj_overall <- cj_sources %>%
select(-matches('site'))
# Get proportions for each source
cj_ancs_long = melt(cj_overall)
cj_overall_summary = melt(tapply(cj_ancs_long$value, cj_ancs_long$variable, mean), varnames="Source", value.name="Proportion")
# Set order: by source, decreasing proportion
cj_overall_summary = cj_overall_summary[order(-cj_overall_summary$Proportion),]
cj_overall_summary$Source = factor(cj_overall_summary$Source, levels=unique(as.character(cj_overall_summary$Source)))
# Generate bar graph showing overall proportions
cj_overall_plot = ggplot(cj_overall_summary) + geom_bar(aes(x=Source, y=Proportion), stat="identity", fill="black") + labs(x="C. jejuni", y="Proportion") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))
### Summary plots and tables for C. coli ###
## Site-by-site summaries ##
cc_sources <- ancs_cc %>%
select(-matches('id')) %>%
select(-matches('date')) %>%
select(-matches('species'))
cc_anc_sites = (aggregate(.~site, cc_sources, mean))
cc_anc_sites_long = melt(cc_anc_sites, id="site", variable.name="Source", value.name="Proportion")
# Site-by-site summary
cc_sbs_summary = setNames(data.frame(t(cc_anc_sites[,-1])), cc_anc_sites[,1])
# Site-by-site plot
cc_sbs_overall_plot = ggplot(cc_anc_sites_long, aes(x = reorder(Source, -Proportion), y = Proportion)) + geom_bar(aes(fill = site), stat="identity", position = position_dodge(width=0.8), width = 0.8) + scale_fill_grey() + theme_grey(base_size = 14) + labs(x="C. coli", y="Proportion") + guides(fill=guide_legend(title=NULL)) + theme(legend.position="bottom", legend.direction = "horizontal", axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))
## Overall summary ##
cc_overall <- cc_sources %>%
select(-matches('site'))
cc_ancs_long = melt(cc_overall)
cc_overall_summary = melt(tapply(cc_ancs_long$value, cc_ancs_long$variable, mean), varnames="Source", value.name="Proportion")
cc_overall_summary = cc_overall_summary[order(-cc_overall_summary$Proportion),]
cc_overall_summary$Source = factor(cc_overall_summary$Source, levels=unique(as.character(cc_overall_summary$Source)))
cc_overall_plot = ggplot(cc_overall_summary) + geom_bar(aes(x=Source, y=Proportion), stat="identity", fill="black") + labs(x="C. coli", y="Proportion") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))
## Combined overall summary graph ##
overall_plot = plot_grid(cj_overall_plot, cc_overall_plot, labels=c("A", "B"), align="h")
# Save figure to output directory
ggsave("Figure1.svg", plot=overall_plot, device="svg", width=210, height=148, units="mm", dpi=300)
## Combined overall site-by-site summary plots ##
overall_sites_plot = plot_grid(cj_sbs_overall_plot, cc_sbs_overall_plot, labels=c("A", "B"), align="h")
ggsave("Figure2.svg", plot=overall_sites_plot, device="svg", width=210, height=148, units="mm", dpi=300)
## Combined overall summary table ##
overall_table = full_join(cj_overall_summary, cc_overall_summary, by = "Source")
colnames(overall_table) <- c("Source", "C. jejuni", "C. coli")
# Get significant figures
overall_table$`C. jejuni` = signif(overall_table$`C. jejuni`, 3)
overall_table$`C. coli` = signif(overall_table$`C. coli`, 3)
## Combined overall site-by-site summary table ##
sbs_table = merge(cj_sbs_summary, cc_sbs_summary, by.x="row.names", by.y="row.names", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))
# Get significant figures
sbs_table[,-1] = signif(sbs_table[,-1], 3)
# Correct header
colnames(sbs_table)[1] <- "Sources"
##### INDIVIDUAL ANCESTRY PLOTS #####
### C. jejuni ###
# Retrieve sources dataframe
# Add "poprank" column that indicates the most likely source (in the case of a tie, 'which' will pick the first column)
cj_sources$poprank = colnames(cj_sources)[apply(cj_sources,1,which.max)]
# Add "max prob" column giving max value to order on later
cj_sources[, "maxprob"] = apply(within(cj_sources, rm(site, poprank)), 1, max)
# Reorder the dataframe from most to least common overall source, first by most likely source, and within each source
cj_order = order(cj_sources$site, ordered(cj_sources$poprank, levels=cj_overall_summary$Source), -cj_sources$maxprob)
cj_sources = cj_sources[cj_order,]
# Add rank column after ordering to give order to ggplot
cj_sources = as.data.frame(cj_sources %>% group_by(site) %>% mutate(rank = row_number(site)))
# Melt data to long form
cj_sources_long = melt(cj_sources, id=c("site","poprank","maxprob","rank"), variable.name = "Source", value.name = "Proportion")
# Generate individual ancestries plots
cj_ind_ancs_plot = ggplot(cj_sources_long, aes(x=rank, y=Proportion, fill=Source, width=1)) + geom_bar(stat="identity") + labs(x="Human disease isolates", y="Source probability") + facet_grid(site ~ .) + set_fill_colours + guides(fill=guide_legend(title="C. jejuni")) + theme(legend.title = element_text(face = "italic"))
### Repeat for C. coli ###
cc_sources$poprank = colnames(cc_sources)[apply(cc_sources,1,which.max)]
cc_sources[, "maxprob"] = apply(within(cc_sources, rm(site, poprank)), 1, max)
cc_order = order(cc_sources$site, ordered(cc_sources$poprank, levels=cc_overall_summary$Source), -cc_sources$maxprob)
cc_sources = cc_sources[cc_order,]
cc_sources = as.data.frame(cc_sources %>% group_by(site) %>% mutate(rank = row_number(site)))
cc_sources_long = melt(cc_sources, id=c("site","poprank","maxprob","rank"), variable.name = "Source", value.name = "Proportion")
# Generate individual ancestries plots
cc_ind_ancs_plot = ggplot(cc_sources_long, aes(x=rank, y=Proportion, fill=Source, width=1)) + geom_bar(stat="identity") + labs(x="Human disease isolates", y="Source probability") + facet_grid(site ~ .) + set_fill_colours + guides(fill=guide_legend(title="C. coli")) + theme(legend.title = element_text(face = "italic"))
## Combined individual ancestry plots ##
overall_ind_ancs_plot = plot_grid(cj_ind_ancs_plot, cc_ind_ancs_plot, labels = c("A", "B"), ncol = 1)
# Save figure to output directory
ggsave("Figure3.svg", plot=overall_ind_ancs_plot, device="svg", width=210, height=148, units="mm", dpi=300)
##### ANNUAL BREAKDOWN #####
### Annual attribution summary for C. jejuni ###
# Add years column to date-complete dataset
cj_years = cj
cj_years$Year = format(cj_years$date,format="%Y")
# Drop unnecessary columns
cj_years <- cj_years %>%
select(-matches('id')) %>%
select(-matches('species'))%>%
select(-matches('date'))
# Get averages by year
cj_years_mean = cj_years %>%
group_by(site, Year) %>%
dplyr::summarize_all(funs(mean))
# Melt data to long form
cj_years_mean_long = melt(cj_years_mean, id=c("site","Year"), variable.name = "Source", value.name = "Proportion")
# Re-order the dataframe so that bars will be stacked from major (bottom) to minor (top) sources
cj_years_order = order(ordered(cj_years_mean_long$Source, levels=cj_overall_summary$Source), decreasing = TRUE)
cj_years_mean_long = cj_years_mean_long[cj_years_order,]
cj_years_mean_long$Source = factor(cj_years_mean_long$Source, levels=unique(as.character(cj_years_mean_long$Source)))
# Force Year to be numeric so that graph will plot
cj_years_mean_long$Year = as.numeric(cj_years_mean_long$Year)
# Generate area plot and manipulate x-axis to display years correctly
cj_years_plot = ggplot(cj_years_mean_long, aes(x=Year, y=Proportion, fill=Source)) + geom_area() + scale_x_continuous(breaks=as.numeric(unique(cj_years_mean_long$Year)), labels=c(as.character(unique(cj_years_mean_long$Year)))) + labs(x="Time (years)", y="Proportion of isolates") + set_fill_colours + guides(fill=guide_legend(title="C. jejuni")) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)
### Annual attribution summary for C. coli ###
cc_years = cc
cc_years$Year = format(cc_years$date,format="%Y")
cc_years <- cc_years %>%
select(-matches('id')) %>%
select(-matches('species'))%>%
select(-matches('date'))
cc_years_mean = cc_years %>%
group_by(site, Year) %>%
dplyr::summarize_all(funs(mean))
cc_years_mean_long = melt(cc_years_mean, id=c("site","Year"), variable.name = "Source", value.name = "Proportion")
cc_years_order = order(ordered(cc_years_mean_long$Source, levels=cc_overall_summary$Source), decreasing = TRUE)
cc_years_mean_long = cc_years_mean_long[cc_years_order,]
cc_years_mean_long$Source = factor(cc_years_mean_long$Source, levels=unique(as.character(cc_years_mean_long$Source)))
cc_years_mean_long$Year = as.numeric(cc_years_mean_long$Year)
# Generate area plot and manipulate x-axis to display years correctly
cc_years_plot = ggplot(cc_years_mean_long, aes(x=Year, y=Proportion, fill=Source)) + geom_area() + scale_x_continuous(breaks=as.numeric(unique(cc_years_mean_long$Year)), labels=c(as.character(unique(cc_years_mean_long$Year)))) + labs(x="Time (years)", y="Proportion of isolates") + set_fill_colours + guides(fill=guide_legend(title="C. coli")) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)
## Combined annual attribution summary plot ##
yearly_plot = plot_grid(cj_years_plot, cc_years_plot, labels=c("A", "B"), ncol = 1)
# Save figure to output directory
ggsave("Figure5.svg", plot=yearly_plot, device="svg", width=210, height=148, units="mm", dpi=300)
## Annual attribution tables for C. jejuni ##
# Reshape the long form table
cj_years_mean_table = dcast(setDT(cj_years_mean), Year ~ site, value.var=c(as.character(unique(cj_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cj_years_mean_table) = gsub(x = names(cj_years_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cj_years_mean_table) = gsub(x = names(cj_years_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cj_years_col_order = order(grepl("^.*OXC.*$", colnames(cj_years_mean_table)))
cj_years_mean_table = cj_years_mean_table[, ..cj_years_col_order]
## Annual attribution tables for C. coli ##
cc_years_mean_table = dcast(setDT(cc_years_mean), Year ~ site, value.var=c(as.character(unique(cc_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cc_years_mean_table) = gsub(x = names(cc_years_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cc_years_mean_table) = gsub(x = names(cc_years_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cc_years_col_order = order(grepl("^.*OXC.*$", colnames(cc_years_mean_table)))
cc_years_mean_table = cc_years_mean_table[, ..cc_years_col_order]
### Annual counts for C. jejuni ###
cj_years_count = cj_years %>%
group_by(site, Year) %>%
summarise(total.count=n())
# Add species column and rename columns
cj_years_count$Species = "C. jejuni"
colnames(cj_years_count) <- c("Site", "Year", "Count", "Species")
### Annual counts for C. coli ###
cc_years_count = cc_years %>%
group_by(site, Year) %>%
summarise(total.count=n())
cc_years_count$Species = "C. coli"
colnames(cc_years_count) <- c("Site", "Year", "Count", "Species")
## Combined annual counts plot ##
# Join C .jejuni and C. coli annual counts for plotting
overall_years_count = union(cj_years_count, cc_years_count)
# Convert years to integers for graphing
overall_years_count$Year = as.integer(overall_years_count$Year)
# Generate annual counts plot
overall_years_count_plot = ggplot(data=overall_years_count, aes(x = Year, y = Count, group = Site)) + geom_line(aes(linetype=Site)) + scale_x_continuous(breaks=as.numeric(unique(overall_years_count$Year)), labels=c(as.character(unique(overall_years_count$Year))), expand=c(0,0)) + labs(x="Time (years)", y="Number of isolates") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(face = "italic")) + facet_grid(Species ~ ., scales = "free")
## Combined annual counts table ##
cj_years_count_report <- as.data.frame(cj_years_count) %>%
select(-matches('Species'))
cj_years_count_report = dcast(cj_years_count_report, Year ~ Site, value.var="Count")
cc_years_count_report <- as.data.frame(cc_years_count) %>%
select(-matches('Species'))
cc_years_count_report = dcast(cc_years_count_report, Year ~ Site, value.var="Count")
overall_years_count_table = merge(cj_years_count_report, cc_years_count_report, by.x="Year", by.y="Year", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))
##### QUARTERLY BREAKDOWN #####
# Define functions for plotting quarterly data
# Code adapted from https://stackoverflow.com/questions/31198144/formatting-a-scale-x-continuous-axis-with-quarterly-data
# Convert quarter to uniform date
make_date <- function(x) {
year <- floor(x)
x <- year + (x - year)/0.4 - 0.125
as.Date(as.yearqtr(x))
}
# Set up formatting for x-axis labels
format_quarters <- function(x) {
x <- as.yearqtr(x)
year <- as.integer(x)
quart <- as.integer(format(x, "%q"))
paste(c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec")[quart],
year)
}
### Quarterly attribution summary for C. jejuni ###
# Add quarters column to date-complete dataset
cj_quarters = cj
cj_quarters$Quarter = as.yearqtr(cj_quarters$date, format = "%Y-%m-%d")
# Coerce quarter into graphable format
cj_quarters$Quarter = format(cj_quarters$Quarter, format = "%Y.%q")
# Drop unnecessary columns
cj_quarters <- cj_quarters %>%
select(-matches('id')) %>%
select(-matches('species')) %>%
select(-matches('date'))
# Get mean by quarter over time
cj_quarters_mean = cj_quarters %>%
group_by(site, Quarter) %>%
dplyr::summarize_all(funs(mean))
# Convert to long form
cj_quarters_mean_long = melt(cj_quarters_mean, id = c("site", "Quarter"), variable.name = "Source", value.name = "Proportion")
# Re-order
cj_quarters_order = order(ordered(cj_quarters_mean_long$Source, levels=cj_overall_summary$Source), decreasing = TRUE)
cj_quarters_mean_long = cj_quarters_mean_long[cj_quarters_order,]
cj_quarters_mean_long$Source = factor(cj_quarters_mean_long$Source, levels=unique(as.character(cj_quarters_mean_long$Source)))
# Generate area plot
cj_quarters_plot = ggplot(cj_quarters_mean_long, aes(x=make_date(as.numeric(Quarter)), y=Proportion, fill=Source)) + geom_area() + set_fill_colours + labs(x="Time (years and calendar quarters)", y="Proportion of isolates") + guides(fill=guide_legend(title="C. jejuni")) + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)
### Quarterly attribution summary for C. coli ###
cc_quarters = cc
cc_quarters$Quarter = as.yearqtr(cc_quarters$date, format = "%Y-%m-%d")
cc_quarters$Quarter = format(cc_quarters$Quarter, format = "%Y.%q")
cc_quarters <- cc_quarters %>%
select(-matches('id')) %>%
select(-matches('species')) %>%
select(-matches('date'))
cc_quarters_mean = cc_quarters %>%
group_by(site, Quarter) %>%
dplyr::summarize_all(funs(mean))
cc_quarters_mean_long = melt(cc_quarters_mean, id = c("site", "Quarter"), variable.name = "Source", value.name = "Proportion")
cc_quarters_order = order(ordered(cc_quarters_mean_long$Source, levels=cc_overall_summary$Source), decreasing = TRUE)
cc_quarters_mean_long = cc_quarters_mean_long[cc_quarters_order,]
cc_quarters_mean_long$Source = factor(cc_quarters_mean_long$Source, levels=unique(as.character(cc_quarters_mean_long$Source)))
# Generate area plot
cc_quarters_plot = ggplot(cc_quarters_mean_long, aes(x=make_date(as.numeric(Quarter)), y=Proportion, fill=Source)) + geom_area() + set_fill_colours + labs(x="Time (years and calendar quarters)", y="Proportion of isolates") + guides(fill=guide_legend(title="C. coli")) + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)
## Combined quarterly attribution plot ##
quarterly_plot = plot_grid(cj_quarters_plot, cc_quarters_plot, labels=c("A", "B"), nrow = 2, align = "v")
# Save figure to output directory
ggsave("Figure6.svg", plot=quarterly_plot, device="svg", width=210, height=148, units="mm", dpi=300)
### Quarterly attribution table for C. jejuni ###
# Reshape the long form table
cj_quarters_mean_table = dcast(setDT(cj_quarters_mean), Quarter ~ site, value.var=c(as.character(unique(cj_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cj_quarters_mean_table) = gsub(x = names(cj_quarters_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cj_quarters_mean_table) = gsub(x = names(cj_quarters_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cj_quarters_col_order = order(grepl("^.*OXC.*$", colnames(cj_quarters_mean_table)))
cj_quarters_mean_table = cj_quarters_mean_table[, ..cj_quarters_col_order]
### Quarterly attribution table for C. coli ###
cc_quarters_mean_table = dcast(setDT(cc_quarters_mean), Quarter ~ site, value.var=c(as.character(unique(cc_overall_summary$Source))))
names(cc_quarters_mean_table) = gsub(x = names(cc_quarters_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cc_quarters_mean_table) = gsub(x = names(cc_quarters_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
cc_quarters_col_order = order(grepl("^.*OXC.*$", colnames(cc_quarters_mean_table)))
cc_quarters_mean_table = cc_quarters_mean_table[, ..cc_quarters_col_order]
### Quarterly counts for C. jejuni ###
cj_quarters_count = cj_quarters %>%
group_by(site, Quarter) %>%
summarise(total.count=n())
# Add species column and rename columns
cj_quarters_count$Species = "C. jejuni"
colnames(cj_quarters_count) <- c("Site", "Quarter", "Count", "Species")
### Quarterly counts for C. coli ###
cc_quarters_count = cc_quarters %>%
group_by(site, Quarter) %>%
summarise(total.count=n())
cc_quarters_count$Species = "C. coli"
colnames(cc_quarters_count) <- c("Site", "Quarter", "Count", "Species")
# Combined quarterly counts for plotting
overall_quarters_count = union(cj_quarters_count, cc_quarters_count)
# Generate quarterly counts plot
overall_quarters_count_plot = ggplot(data=overall_quarters_count, aes(x = make_date(as.numeric(Quarter)), y = Count, group = Site)) + geom_line(aes(linetype=Site)) + labs(x="Time (years and calendar quarters)", y="Number of isolates") + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(face = "italic")) + facet_grid(Species ~ ., scales = "free")
## Combined annual and quarterly counts plot ##
overall_counts_combined = plot_grid(overall_years_count_plot, overall_quarters_count_plot, labels=c("A", "B"), nrow = 2, align = "v")
# Save plot
ggsave("Figure4.svg", plot=overall_counts_combined, device="svg", width=210, height=148, units="mm", dpi=300)
## Combined quarterly counts table ##
cj_quarters_count_report <- as.data.frame(cj_quarters_count) %>%
select(-matches('Species'))
cj_quarters_count_report = dcast(cj_quarters_count_report, Quarter ~ Site, value.var="Count")
cc_quarters_count_report <- as.data.frame(cc_quarters_count) %>%
select(-matches('Species'))
cc_quarters_count_report = dcast(cc_quarters_count_report, Quarter ~ Site, value.var="Count")
overall_quarters_count_table = merge(cj_quarters_count_report, cc_quarters_count_report, by.x="Quarter", by.y="Quarter", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))
###############################################################################
##### REPORT GENERATION #####
###############################################################################
##### CREATE DOCUMENT AND TITLE #####
doc = docx(title="FSA_Report_Skeleton")
doc = addTitle(doc , 'FS101013: Source attribution of campylobacteriosis isolates from Newcastle/North Tyneside and Oxfordshire', level=1)
##### INTRODUCTION #####
doc = addTitle(doc, 'Introduction', level=2)
doc = addParagraph(doc, 'This is an automated report summarizing the population genetic assignment of Campylobacter isolates from human disease cases in Newcastle/North Tyneside and Oxfordshire, UK. Isolates were assigned to putative sources based on population genetic analysis using STRUCTURE (1) or iSource (2) software, as selected by the user. The reporting system was developed using funding from the Food Standards Agency (FS101013) and the NIHR Health Protection Research Unit in Gastrointestinal Infections.', stylename = 'Normal')
##### METHODS #####
doc = addTitle(doc , 'Methods', level=2)
### Describe datasets ###
doc = addTitle(doc, 'Datasets', level=3)
doc = addParagraph(doc, 'Attribution analyses were carried out using data stored in the PubMLST database (https://pubmlst.org/ campylobacter/) (3).', stylename = 'Normal')
## Reference sets ##
doc = addTitle(doc, 'Reference isolates', level=4)
doc = addParagraph(doc, 'Validated C. jejuni (n = 7715) and C. coli (n = 3521) reference sets were exported from the PubMLST database in July 2017 and remain unchanged (Jansen van Rensburg et al., In preparation).', stylename = 'Normal')
## Human disease sets ##
doc = addTitle(doc, 'Human disease isolates', level=4)
# Print simpler statement if no missing data detected
if (((cj_nwc_missing_dates + cj_oxc_missing_dates)==0)&((cc_nwc_missing_dates + cc_oxc_missing_dates)==0)) {
doc = addParagraph(doc, sprintf('A total of %s C. jejuni and %s C. coli isolates, collected between %s and %s in Newcastle/North Tyneside and Oxfordshire, were attributed to animal and/or environmental sources.', no_ancs_cj, no_ancs_cc, min_date, max_date), stylename = 'Normal')
} else {
doc = addParagraph(doc, sprintf('A total of %s C. jejuni and %s C. coli isolates, collected between %s and %s in Newcastle/North Tyneside and Oxfordshire, were attributed to animal and/or environmental sources; however, isolates without dates of isolation/laboratory receipt dates were excluded from all date-based analyses presented in this report. Following data cleaning, %s and %s C. jejuni from Newcastle/North Tyneside and Oxfordshire, respectively, were excluded, as were %s and %s C. coli from Newcastle/North Tyneside and Oxfordshire.', no_ancs_cj, no_ancs_cc, min_date, max_date, cj_nwc_missing_dates, cj_oxc_missing_dates, cc_nwc_missing_dates, cc_oxc_missing_dates), stylename = 'Normal')
}
### Describe attribution ###
doc = addTitle(doc, 'Source attribution', level=3)
doc = addParagraph(doc, 'Isolates were assigned to putative host sources using STRUCTURE or iSource, based on analysis of MLST data. The algorithms were run separately for C. jejuni and C. coli. For STRUCTURE analyses, the no-admixture model was used and the program was run using a burn-in period of 1,000 cycles followed by 10,000 iterations. For iSource, the Asymmetric Island model was used and the program was run for 10,000 iterations without thinning, using a symmetric Dirichlet prior.', stylename = 'Normal')
doc = addParagraph(doc, 'The probabilities of attribution to each source are summed as an arithmetic mean across the cases of human infection (Figures 1 and 2), presented for individual isolates (Figure 3), and summed over time (Figure 5 onwards).', stylename = 'Normal')
doc = addParagraph(doc, 'The attribution carries the assumption that all isolates came from one of the sources in the analysis. Isolates from sources not represented will be assigned to sources present in the reference sets according to their genetic similarity. The results presented do not include any adjustment for bias that may occur in population genetic attribution with the included reference isolates. Results should be considered in the light of the results of the reference set validation study (Cody et al., In preparation; Jansen van Rensburg et al., In preparation).', stylename = 'Normal')
doc <- addPageBreak(doc)
##### RESULTS #####
doc = addTitle(doc, 'Results', level=2)
### Overall summary ###
doc = addTitle(doc, 'Overall summary', level=3)
doc = addParagraph(doc, 'Figures 1 and 2 show the overall and site-by-site proportions attributed to each source in the reference set, respectively. Figure 3 shows the probabilistic assignments of individual isolates. Tabulated data for Figures 1 and 2 are provided in the Appendices.', stylename = 'Normal')
doc = addParagraph(doc, 'Tabulated data for Figures 1 and 2 are provided in the Appendices.', stylename = 'Normal')
# Overall summary plot
doc = addPlot(doc , fun=print, x=overall_plot, width=6, height=4)
doc = addParagraph(doc, sprintf('Figure 1. Estimated proportion of human disease isolates attributed to putative sources. Probabilistic assignment of (A) %s C. jejuni and (B) %s C. coli human disease isolates collected between %s and %s in Newcastle/North Tyneside and Oxfordshire.', no_ancs_cj, no_ancs_cc, min_date, max_date), stylename='Normal')
# Overall site-by-site plot
doc = addPlot(doc, fun=print, x=overall_sites_plot, width=6, height=4)
doc = addParagraph(doc, sprintf('Figure 2. Estimated proportion of human disease isolates attributed to putative sources. Probabilistic assignment of (A) C. jejuni collected in Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s) between %s and %s, and (B) C. coli collected in Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s) between %s and %s.', all_cj_nwc, all_cj_oxc, min_date_cj, max_date_cj, all_cc_nwc, all_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')
# Overall individual ancestries plot
doc = addPlot(doc , fun=print, x=overall_ind_ancs_plot, width=6.5, height=6)
doc = addParagraph(doc, sprintf('Figure 3. Source probabilities for individual human disease isolates. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Isolates are represented as vertical bars coloured according to the estimated probability for each source, as shown in the legends. Isolates are ordered horizontally to aid visualisation, first by most likely source and then by decreasing probability within each source.', all_cj_nwc, all_cj_oxc, all_cc_nwc, all_cc_oxc), stylename='Normal')
doc <- addPageBreak(doc)
### Isolate counts ###
doc = addTitle(doc, 'Breakdown of isolates over time', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')
# Isolate count over years and quarters plot
doc = addPlot(doc , fun=print, x=overall_counts_combined, width=6.5, height=7)
doc = addParagraph(doc, 'Figure 4. Number of isolates collected in Newcastle/North Tyneside and Oxfordshire per year (A), and per year and calendar quarter (B).', stylename='Normal')
doc <- addPageBreak(doc)
### Annual breakdown ###
doc = addTitle(doc, 'Annual attribution breakdown', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')
# Annual attribution plot
doc = addPlot(doc , fun=print, x=yearly_plot, width=6.5, height=6.5)
doc = addParagraph(doc, sprintf('Figure 5. Estimated proportion of human disease isolates attributed to putative sources per year. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Bars are ordered from major (bottom) to minor (top) sources based on the overall proportions shown in Figure 1.', dated_cj_nwc, dated_cj_oxc, dated_cc_nwc, dated_cc_oxc), stylename='Normal')
doc <- addPageBreak(doc)
### Quarterly breakdown ###
doc = addTitle(doc, 'Quarterly breakdown', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')
# Quarterly attribution plot
doc = addPlot(doc , fun=print, x=quarterly_plot, width=6.5, height=7.5)
doc = addParagraph(doc, sprintf('Figure 6. Estimated proportion of human disease isolates attributed to putative sources per calendar quarter. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Bars are ordered from major (bottom) to minor (top) sources based on the overall proportions shown in Figure 1.', dated_cj_nwc, dated_cj_oxc, dated_cc_nwc, dated_cc_oxc), stylename='Normal')
doc <- addPageBreak(doc)
##### REFERENCES #####
doc = addTitle(doc, 'References', level=2)
doc = addParagraph(doc, '1. Pritchard, J.K., Stephens, M. and Donnelly, P., 2000. Inference of population structure using multilocus genotype data. Genetics, 155(2), pp.945-959.', stylename = 'Normal')
doc = addParagraph(doc, '2. Wilson, D.J., Gabriel, E., Leatherbarrow, A.J., Cheesbrough, J., Gee, S., Bolton, E., Fox, A., Fearnhead, P., Hart, C.A. and Diggle, P.J., 2008. Tracing the source of campylobacteriosis. PLoS Genetics, 4(9), p.e1000203.', stylename = 'Normal')
doc = addParagraph(doc, '3. Jolley, K.A. and Maiden, M.C., 2010. BIGSdb: scalable analysis of bacterial genome variation at the population level. BMC bioinformatics, 11(1), p.595.', stylename = 'Normal')
doc <- addPageBreak(doc)
##### APPENDICES #####
doc = addTitle(doc, 'Appendices', level=2)
### Overall summary ###
# Overall summary table
doc = addTitle(doc, 'Overall summary', level=3)
doc = addParagraph(doc, sprintf('Table A1. Attribution to putative sources of %s C. jejuni and %s C. coli human disease isolates collected between %s and %s in Newcastle/North Tyneside and Oxfordshire.', no_cj, no_cc, min_date, max_date), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_table))
doc = addParagraph(doc, '\r\n', stylename=)
# Overall site-by-site table
doc = addParagraph(doc, sprintf('Table A2. Attribution to putative sources of C. jejuni (Cj) and C. coli (Cc) human disease isolates from Newcastle/North Tyneside (n = %s; n = %s) and Oxfordshire (n = %s; n = %s).', all_cj_nwc, all_cc_nwc, all_cj_oxc, all_cc_oxc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(sbs_table))
doc = addParagraph(doc, '\r\n', stylename=)
doc <- addPageBreak(doc)
### Isolate counts ###
doc = addTitle(doc, 'Breakdown of isolates over time', level=3)
# Breakdown of number of isolates per year
doc = addParagraph(doc, sprintf('Table A3. Number of C. jejuni (Cj) and C. coli (Cc) human disease isolates collected in Newcastle/North Tyneside and Oxfordshire per year between %s and %s.', min_date, max_date), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_years_count_table))
doc = addParagraph(doc, '\r\n', stylename=)
# Breakdown of number of isolates per quarter
# Convert quarters into readable format that matches plots
overall_quarters_count_table$Quarter = format_quarters(make_date(as.numeric(overall_quarters_count_table$Quarter)))
doc = addParagraph(doc, 'Table A4. Number of C. jejuni (Cj) and C. coli (Cc) human disease isolates per quarter', stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_quarters_count_table))
doc = addParagraph(doc, '\r\n', stylename=)
doc <- addPageBreak(doc)
### Annual breakdown ###
doc = addTitle(doc, 'Annual breakdown', level=3)
# Breakdown of attribution per year
# C. jejuni
# Get significant figures
cj_years_mean_table <- cj_years_mean_table %>%
mutate_if(is.numeric, funs(signif(., 3)))
# Display table
doc = addParagraph(doc, sprintf('Table A5. Proportion of C. jejuni isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per year between %s and %s.', dated_cj_nwc, dated_cj_oxc, min_date_cj, max_date_cj), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cj_years_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)
# C. coli
# Get significant figures
cc_years_mean_table <- cc_years_mean_table %>%
mutate_if(is.numeric, funs(signif(., 3)))
# Display table
doc = addParagraph(doc, sprintf('Table A6. Proportion of C. coli isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per year between %s and %s.', dated_cc_nwc, dated_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cc_years_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)
doc <- addPageBreak(doc)
### Quarterly breakdown ###
doc = addTitle(doc, 'Quarterly breakdown', level=3)
# Breakdown of attribution per quarter
# C. jejuni
# Get significant figures
cj_quarters_mean_table <- cj_quarters_mean_table %>%
mutate_if(is.numeric, funs(signif(., 3)))
# Convert quarters into readable format that matches plots
cj_quarters_mean_table$Quarter = format_quarters(make_date(as.numeric(cj_quarters_mean_table$Quarter)))
# Display table
doc = addParagraph(doc, sprintf('Table A7. Proportion of C. jejuni isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per calendar quarter between %s and %s.', dated_cj_nwc, dated_cj_oxc, min_date_cj, max_date_cj), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cj_quarters_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)
# C. coli
# Get significant figures
cc_quarters_mean_table <- cc_quarters_mean_table %>%
mutate_if(is.numeric, funs(signif(., 3)))
# Convert quarters into readable format that matches plots
cc_quarters_mean_table$Quarter = format_quarters(make_date(as.numeric(cc_quarters_mean_table$Quarter)))
# Display table
doc = addParagraph(doc, sprintf('Table A8. Proportion of C. coli isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per calendar quarter between %s and %s.', dated_cc_nwc, dated_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cc_quarters_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)
writeDoc(doc, "FSA_Report_Skeleton.docx")