plot-tributR_FSA.r

# This code produces an automated MS Word report based on attribution analyses of C. jejuni and C. coli. It requires as input the inferred ancestries files generated by the bigsdb_attributor package (one each for C. jejuni and C. coli)
# Developed using funding from the Food Standards Agency (FS101013) and the NIHR Health Protection Research Unit in Gastrointestinal Infections
# Written by Melissa Jansen van Rensburg (2018)

#TODO: In py script head id column in ancs output; add date column (filling as per usual)
#TODO: fill site automatically
#TODO: fix human fill problem in attributor

library(ReporteRs)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
library(RColorBrewer)
library(zoo)
library(data.table)
library(scales)


##### SET-UP #####
### Provide paths to input files ###
# C. jejuni
cj_input_data = "/Users/MelissaJvR/Desktop/FSA_Final/RInputFiles/FSA_Cj_inferred-ancestry.csv"
# C. coli
cc_input_data = "/Users/MelissaJvR/Desktop/FSA_Final/RInputFiles/FSA_Cc_inferred-ancestry.csv"

# Define colour palette
set_fill_colours = scale_fill_manual(values = c("Cattle" = "#6FBAFF", "Chicken" = "#FFF697", "Environment" = "#A4FF53", "Sheep" = "#B3B3B3", "Turkey" = "#000000", "Duck" = "#FF9E36", "Goose" = "#FF6666", "Pig" = "#ffc0e9", "Wild.bird" = "#803C20", "Other animal" = "#800080", "Dog" = "#008080", "Ruminant" = "#6FBAFF"))
# Alternative qualitative palette
# If this is used, you must ensure that colour mapping is consistent across all graphs (some editing of the script will be required)
# set_fill_colours = scale_fill_brewer(type="qual", palette="Set3", direction=1)


##### FILE PROCESSING #####
# Open C. jejuni file, convert missing data to NA and drop rows without dates
# Note that PubMLST dates are currently in ISO 8601 format (for xx/xx/xxxx use format=format='%d-%m-%Y')
ancs_cj = read.csv(cj_input_data, header=TRUE, check.names=TRUE)
ancs_cj[ancs_cj==""] <- NA
ancs_cj$date <- as.Date(ancs_cj$date)
cj = ancs_cj[!is.na(ancs_cj$date),]
# Repeat for C. coli
ancs_cc = read.csv(cc_input_data, header=TRUE, check.names=TRUE)
ancs_cc[ancs_cc==""] <- NA
ancs_cc$date <- as.Date(ancs_cc$date)
cc = ancs_cc[!is.na(ancs_cc$date),]


##### DIRECTORY SET-UP #####
# Get current directory and name output directory
# NOTE: the script will fail here if the output directory already exists.  Remove/rename the existing directory and try again.  If you've deleted the file, make sure that your current working directory isn't set to Trash!  To do this, enter "getwd()" and then run the command "setwd(PATH/TO/DESIRED/DIRECTORY)" to move to the right directory.
current_dir = sprintf("%s",getwd())
output_dir = "AttributionReportFSA"
# Create output diretory
dir.create(file.path(current_dir, output_dir))
# Set output directory as current directory
setwd(file.path(current_dir, output_dir))


##### DATA PROCESSING #####
### Isolate counts for original and cleaned datasets ###
# C. jejuni
no_ancs_cj = nrow(ancs_cj)
no_cj = nrow(cj)
cj_missing_dates = no_ancs_cj - no_cj
cj_missing_dates_prop = round((cj_missing_dates/no_ancs_cj*100), digits=2)
all_cj_oxc = sum(ancs_cj$site == 'Oxfordshire')
all_cj_nwc = sum(ancs_cj$site == 'Newcastle')
dated_cj_oxc = sum(cj$site == 'Oxfordshire')
dated_cj_nwc = sum(cj$site == 'Newcastle')
cj_oxc_missing_dates = all_cj_oxc - dated_cj_oxc 
cj_nwc_missing_dates = all_cj_nwc - dated_cj_nwc
# C. coli
no_ancs_cc = nrow(ancs_cc)
no_cc = nrow(cc)
cc_missing_dates = no_ancs_cc - no_cc
cc_missing_dates_prop = round((cc_missing_dates/no_ancs_cc*100), digits=2)
all_cc_oxc = sum(ancs_cc$site == 'Oxfordshire')
all_cc_nwc = sum(ancs_cc$site == 'Newcastle')
dated_cc_oxc = sum(cc$site == 'Oxfordshire')
dated_cc_nwc = sum(cc$site == 'Newcastle')
cc_oxc_missing_dates = all_cc_oxc - dated_cc_oxc 
cc_nwc_missing_dates = all_cc_nwc - dated_cc_nwc

### Max and min dates ###
# C. jejuni
max_date_cj = format(max(as.Date(cj$date, format="%d/%m/%Y")), "%B %Y")
min_date_cj = format(min(as.Date(cj$date, format="%d/%m/%Y")), "%B %Y")
# C. coli
max_date_cc = format(max(as.Date(cc$date, format="%d/%m/%Y")), "%B %Y")
min_date_cc = format(min(as.Date(cc$date, format="%d/%m/%Y")), "%B %Y")
# Overall
max_date = format(max(as.Date(dplyr::bind_rows(cj, cc)$date, format="%d/%m/%Y")), "%B %Y")
min_date = format(min(as.Date(dplyr::bind_rows(cj, cc)$date, format="%d/%m/%Y")), "%B %Y")


##### OVERALL SUMMARIES #####
### Summary plots and tables for C. jejuni ###
## Site-by-site summaries ##
# Get only source columns for overall summaries
cj_sources <- ancs_cj %>%
	select(-matches('id')) %>%
	select(-matches('date')) %>%
	select(-matches('species'))
# Get proportions for each source, site by site and convert to long form
cj_anc_sites = (aggregate(.~site, cj_sources, mean))
cj_anc_sites_long = melt(cj_anc_sites, id="site", variable.name="Source", value.name="Proportion")
# Create site-by-site table
cj_sbs_summary = setNames(data.frame(t(cj_anc_sites[,-1])), cj_anc_sites[,1])

# Generate grouped bar graph showing site-by-site proportions
cj_sbs_overall_plot = ggplot(cj_anc_sites_long, aes(x = reorder(Source, -Proportion), y = Proportion)) + geom_bar(aes(fill = site), stat="identity", position = position_dodge(width=0.8), width = 0.8) + scale_fill_grey() + theme_grey(base_size = 14) + labs(x="C. jejuni", y="Proportion") + guides(fill=guide_legend(title=NULL)) + theme(legend.position="bottom", legend.direction = "horizontal", axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))

## Overall summary ##
# Drop sites to get overall proportions
cj_overall <- cj_sources %>%
	select(-matches('site'))
# Get proportions for each source
cj_ancs_long = melt(cj_overall)
cj_overall_summary = melt(tapply(cj_ancs_long$value, cj_ancs_long$variable, mean), varnames="Source", value.name="Proportion")
# Set order: by source, decreasing proportion
cj_overall_summary = cj_overall_summary[order(-cj_overall_summary$Proportion),]
cj_overall_summary$Source = factor(cj_overall_summary$Source, levels=unique(as.character(cj_overall_summary$Source)))
# Generate bar graph showing overall proportions
cj_overall_plot = ggplot(cj_overall_summary) + geom_bar(aes(x=Source, y=Proportion), stat="identity", fill="black") + labs(x="C. jejuni", y="Proportion") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))

### Summary plots and tables for C. coli ###
## Site-by-site summaries ##
cc_sources <- ancs_cc %>%
	select(-matches('id')) %>%
	select(-matches('date')) %>%
	select(-matches('species'))
cc_anc_sites = (aggregate(.~site, cc_sources, mean))
cc_anc_sites_long = melt(cc_anc_sites, id="site", variable.name="Source", value.name="Proportion")
# Site-by-site summary
cc_sbs_summary = setNames(data.frame(t(cc_anc_sites[,-1])), cc_anc_sites[,1])
# Site-by-site plot
cc_sbs_overall_plot = ggplot(cc_anc_sites_long, aes(x = reorder(Source, -Proportion), y = Proportion)) + geom_bar(aes(fill = site), stat="identity", position = position_dodge(width=0.8), width = 0.8) + scale_fill_grey() + theme_grey(base_size = 14) + labs(x="C. coli", y="Proportion") + guides(fill=guide_legend(title=NULL)) + theme(legend.position="bottom", legend.direction = "horizontal", axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))

## Overall summary ##
cc_overall <- cc_sources %>%
	select(-matches('site'))
cc_ancs_long = melt(cc_overall)
cc_overall_summary = melt(tapply(cc_ancs_long$value, cc_ancs_long$variable, mean), varnames="Source", value.name="Proportion")
cc_overall_summary = cc_overall_summary[order(-cc_overall_summary$Proportion),]
cc_overall_summary$Source = factor(cc_overall_summary$Source, levels=unique(as.character(cc_overall_summary$Source)))
cc_overall_plot = ggplot(cc_overall_summary) + geom_bar(aes(x=Source, y=Proportion), stat="identity", fill="black") + labs(x="C. coli", y="Proportion") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_text(face="italic"))

## Combined overall summary graph ##
overall_plot = plot_grid(cj_overall_plot, cc_overall_plot, labels=c("A", "B"), align="h")
# Save figure to output directory
ggsave("Figure1.svg", plot=overall_plot, device="svg", width=210, height=148, units="mm", dpi=300)

## Combined overall site-by-site summary plots ##
overall_sites_plot = plot_grid(cj_sbs_overall_plot, cc_sbs_overall_plot, labels=c("A", "B"), align="h")
ggsave("Figure2.svg", plot=overall_sites_plot, device="svg", width=210, height=148, units="mm", dpi=300)

## Combined overall summary table ##
overall_table = full_join(cj_overall_summary, cc_overall_summary, by = "Source")
colnames(overall_table) <- c("Source", "C. jejuni", "C. coli")
# Get significant figures
overall_table$`C. jejuni` = signif(overall_table$`C. jejuni`, 3)
overall_table$`C. coli` = signif(overall_table$`C. coli`, 3)

## Combined overall site-by-site summary table ##
sbs_table = merge(cj_sbs_summary, cc_sbs_summary, by.x="row.names", by.y="row.names", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))
# Get significant figures
sbs_table[,-1] = signif(sbs_table[,-1], 3)
# Correct header
colnames(sbs_table)[1] <- "Sources"


##### INDIVIDUAL ANCESTRY PLOTS #####
### C. jejuni ###
# Retrieve sources dataframe
# Add "poprank" column that indicates the most likely source (in the case of a tie, 'which' will pick the first column)
cj_sources$poprank = colnames(cj_sources)[apply(cj_sources,1,which.max)]
# Add "max prob" column giving max value to order on later
cj_sources[, "maxprob"] = apply(within(cj_sources, rm(site, poprank)), 1, max)
# Reorder the dataframe from most to least common overall source, first by most likely source, and within each source 
cj_order = order(cj_sources$site, ordered(cj_sources$poprank, levels=cj_overall_summary$Source), -cj_sources$maxprob)
cj_sources = cj_sources[cj_order,]
# Add rank column after ordering to give order to ggplot
cj_sources = as.data.frame(cj_sources %>% group_by(site) %>% mutate(rank = row_number(site)))
# Melt data to long form
cj_sources_long = melt(cj_sources, id=c("site","poprank","maxprob","rank"), variable.name = "Source", value.name = "Proportion")

# Generate individual ancestries plots
cj_ind_ancs_plot = ggplot(cj_sources_long, aes(x=rank, y=Proportion, fill=Source, width=1)) + geom_bar(stat="identity") + labs(x="Human disease isolates", y="Source probability") + facet_grid(site ~ .) + set_fill_colours + guides(fill=guide_legend(title="C. jejuni")) + theme(legend.title = element_text(face = "italic"))

### Repeat for C. coli ###
cc_sources$poprank = colnames(cc_sources)[apply(cc_sources,1,which.max)]
cc_sources[, "maxprob"] = apply(within(cc_sources, rm(site, poprank)), 1, max)
cc_order = order(cc_sources$site, ordered(cc_sources$poprank, levels=cc_overall_summary$Source), -cc_sources$maxprob)
cc_sources = cc_sources[cc_order,]
cc_sources = as.data.frame(cc_sources %>% group_by(site) %>% mutate(rank = row_number(site)))
cc_sources_long = melt(cc_sources, id=c("site","poprank","maxprob","rank"), variable.name = "Source", value.name = "Proportion")

# Generate individual ancestries plots
cc_ind_ancs_plot = ggplot(cc_sources_long, aes(x=rank, y=Proportion, fill=Source, width=1)) + geom_bar(stat="identity") + labs(x="Human disease isolates", y="Source probability") + facet_grid(site ~ .) + set_fill_colours + guides(fill=guide_legend(title="C. coli")) + theme(legend.title = element_text(face = "italic"))

## Combined individual ancestry plots ##
overall_ind_ancs_plot = plot_grid(cj_ind_ancs_plot, cc_ind_ancs_plot, labels = c("A", "B"), ncol = 1)
# Save figure to output directory
ggsave("Figure3.svg", plot=overall_ind_ancs_plot, device="svg", width=210, height=148, units="mm", dpi=300)


##### ANNUAL BREAKDOWN #####
### Annual attribution summary for C. jejuni ###
# Add years column to date-complete dataset
cj_years = cj
cj_years$Year = format(cj_years$date,format="%Y")
# Drop unnecessary columns
cj_years <- cj_years %>%
    select(-matches('id')) %>%
    select(-matches('species'))%>%
    select(-matches('date'))
# Get averages by year
cj_years_mean = cj_years %>%
    group_by(site, Year) %>%
    dplyr::summarize_all(funs(mean))
# Melt data to long form
cj_years_mean_long = melt(cj_years_mean, id=c("site","Year"), variable.name = "Source", value.name = "Proportion")

# Re-order the dataframe so that bars will be stacked from major (bottom) to minor (top) sources
cj_years_order = order(ordered(cj_years_mean_long$Source, levels=cj_overall_summary$Source), decreasing = TRUE)
cj_years_mean_long = cj_years_mean_long[cj_years_order,]
cj_years_mean_long$Source = factor(cj_years_mean_long$Source, levels=unique(as.character(cj_years_mean_long$Source)))
# Force Year to be numeric so that graph will plot
cj_years_mean_long$Year = as.numeric(cj_years_mean_long$Year)

# Generate area plot and manipulate x-axis to display years correctly
cj_years_plot = ggplot(cj_years_mean_long, aes(x=Year, y=Proportion, fill=Source)) + geom_area() + scale_x_continuous(breaks=as.numeric(unique(cj_years_mean_long$Year)), labels=c(as.character(unique(cj_years_mean_long$Year)))) + labs(x="Time (years)", y="Proportion of isolates") + set_fill_colours + guides(fill=guide_legend(title="C. jejuni")) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .) 

### Annual attribution summary for C. coli ###
cc_years = cc
cc_years$Year = format(cc_years$date,format="%Y")
cc_years <- cc_years %>%
    select(-matches('id')) %>%
    select(-matches('species'))%>%
    select(-matches('date'))
cc_years_mean = cc_years %>%
    group_by(site, Year) %>%
    dplyr::summarize_all(funs(mean))
cc_years_mean_long = melt(cc_years_mean, id=c("site","Year"), variable.name = "Source", value.name = "Proportion")
cc_years_order = order(ordered(cc_years_mean_long$Source, levels=cc_overall_summary$Source), decreasing = TRUE)
cc_years_mean_long = cc_years_mean_long[cc_years_order,]
cc_years_mean_long$Source = factor(cc_years_mean_long$Source, levels=unique(as.character(cc_years_mean_long$Source)))
cc_years_mean_long$Year = as.numeric(cc_years_mean_long$Year)

# Generate area plot and manipulate x-axis to display years correctly
cc_years_plot = ggplot(cc_years_mean_long, aes(x=Year, y=Proportion, fill=Source)) + geom_area() + scale_x_continuous(breaks=as.numeric(unique(cc_years_mean_long$Year)), labels=c(as.character(unique(cc_years_mean_long$Year)))) + labs(x="Time (years)", y="Proportion of isolates") + set_fill_colours + guides(fill=guide_legend(title="C. coli")) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)

## Combined annual attribution summary plot ##
yearly_plot = plot_grid(cj_years_plot, cc_years_plot,  labels=c("A", "B"), ncol = 1)
# Save figure to output directory
ggsave("Figure5.svg", plot=yearly_plot, device="svg", width=210, height=148, units="mm", dpi=300)

## Annual attribution tables for C. jejuni ##
# Reshape the long form table
cj_years_mean_table = dcast(setDT(cj_years_mean), Year ~ site, value.var=c(as.character(unique(cj_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cj_years_mean_table) = gsub(x = names(cj_years_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cj_years_mean_table) = gsub(x = names(cj_years_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cj_years_col_order = order(grepl("^.*OXC.*$", colnames(cj_years_mean_table)))
cj_years_mean_table = cj_years_mean_table[, ..cj_years_col_order]

## Annual attribution tables for C. coli ##
cc_years_mean_table = dcast(setDT(cc_years_mean), Year ~ site, value.var=c(as.character(unique(cc_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cc_years_mean_table) = gsub(x = names(cc_years_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cc_years_mean_table) = gsub(x = names(cc_years_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cc_years_col_order = order(grepl("^.*OXC.*$", colnames(cc_years_mean_table)))
cc_years_mean_table = cc_years_mean_table[, ..cc_years_col_order]

### Annual counts for C. jejuni ###
cj_years_count = cj_years %>%
    group_by(site, Year) %>%
    summarise(total.count=n())
# Add species column and rename columns
cj_years_count$Species = "C. jejuni"
colnames(cj_years_count) <- c("Site", "Year", "Count", "Species")

### Annual counts for C. coli ###
cc_years_count = cc_years %>%
    group_by(site, Year) %>%
    summarise(total.count=n())
cc_years_count$Species = "C. coli"
colnames(cc_years_count) <- c("Site", "Year", "Count", "Species")

## Combined annual counts plot ##
# Join C .jejuni and C. coli annual counts for plotting
overall_years_count = union(cj_years_count, cc_years_count)
# Convert years to integers for graphing
overall_years_count$Year = as.integer(overall_years_count$Year)

# Generate annual counts plot
overall_years_count_plot = ggplot(data=overall_years_count, aes(x = Year, y = Count, group = Site)) + geom_line(aes(linetype=Site)) + scale_x_continuous(breaks=as.numeric(unique(overall_years_count$Year)), labels=c(as.character(unique(overall_years_count$Year))), expand=c(0,0)) + labs(x="Time (years)", y="Number of isolates") + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(face = "italic")) + facet_grid(Species ~ ., scales = "free")

## Combined annual counts table ##
cj_years_count_report <- as.data.frame(cj_years_count) %>%
	select(-matches('Species'))
cj_years_count_report = dcast(cj_years_count_report, Year ~ Site, value.var="Count")
cc_years_count_report <- as.data.frame(cc_years_count) %>%
	select(-matches('Species'))
cc_years_count_report = dcast(cc_years_count_report, Year ~ Site, value.var="Count")
overall_years_count_table = merge(cj_years_count_report, cc_years_count_report, by.x="Year", by.y="Year", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))


##### QUARTERLY BREAKDOWN #####
# Define functions for plotting quarterly data
# Code adapted from https://stackoverflow.com/questions/31198144/formatting-a-scale-x-continuous-axis-with-quarterly-data
# Convert quarter to uniform date
make_date <- function(x) {
  year <- floor(x)
  x <- year + (x - year)/0.4 - 0.125
  as.Date(as.yearqtr(x))
}
# Set up formatting for x-axis labels
format_quarters <- function(x) {
  x <- as.yearqtr(x)
  year <- as.integer(x)
  quart <- as.integer(format(x, "%q"))

  paste(c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec")[quart], 
        year)
}

### Quarterly attribution summary for C. jejuni ###
# Add quarters column to date-complete dataset
cj_quarters = cj
cj_quarters$Quarter = as.yearqtr(cj_quarters$date, format = "%Y-%m-%d")
# Coerce quarter into graphable format
cj_quarters$Quarter = format(cj_quarters$Quarter, format = "%Y.%q")
# Drop unnecessary columns
cj_quarters <- cj_quarters %>%
    select(-matches('id')) %>%
    select(-matches('species')) %>%
    select(-matches('date'))
# Get mean by quarter over time
cj_quarters_mean = cj_quarters %>%
    group_by(site, Quarter) %>%
    dplyr::summarize_all(funs(mean))
# Convert to long form
cj_quarters_mean_long = melt(cj_quarters_mean, id = c("site", "Quarter"), variable.name = "Source", value.name = "Proportion")

# Re-order
cj_quarters_order = order(ordered(cj_quarters_mean_long$Source, levels=cj_overall_summary$Source), decreasing = TRUE)
cj_quarters_mean_long = cj_quarters_mean_long[cj_quarters_order,]
cj_quarters_mean_long$Source = factor(cj_quarters_mean_long$Source, levels=unique(as.character(cj_quarters_mean_long$Source)))

# Generate area plot
cj_quarters_plot = ggplot(cj_quarters_mean_long, aes(x=make_date(as.numeric(Quarter)), y=Proportion, fill=Source)) + geom_area() + set_fill_colours + labs(x="Time (years and calendar quarters)", y="Proportion of isolates") + guides(fill=guide_legend(title="C. jejuni")) + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)

### Quarterly attribution summary for C. coli ###
cc_quarters = cc
cc_quarters$Quarter = as.yearqtr(cc_quarters$date, format = "%Y-%m-%d")
cc_quarters$Quarter = format(cc_quarters$Quarter, format = "%Y.%q")
cc_quarters <- cc_quarters %>%
    select(-matches('id')) %>%
    select(-matches('species')) %>%
    select(-matches('date'))
cc_quarters_mean = cc_quarters %>%
    group_by(site, Quarter) %>%
    dplyr::summarize_all(funs(mean))
cc_quarters_mean_long = melt(cc_quarters_mean, id = c("site", "Quarter"), variable.name = "Source", value.name = "Proportion")
cc_quarters_order = order(ordered(cc_quarters_mean_long$Source, levels=cc_overall_summary$Source), decreasing = TRUE)
cc_quarters_mean_long = cc_quarters_mean_long[cc_quarters_order,]
cc_quarters_mean_long$Source = factor(cc_quarters_mean_long$Source, levels=unique(as.character(cc_quarters_mean_long$Source)))

# Generate area plot
cc_quarters_plot = ggplot(cc_quarters_mean_long, aes(x=make_date(as.numeric(Quarter)), y=Proportion, fill=Source)) + geom_area() + set_fill_colours + labs(x="Time (years and calendar quarters)", y="Proportion of isolates") + guides(fill=guide_legend(title="C. coli")) + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(face = "italic")) + facet_grid(site ~ .)

## Combined quarterly attribution plot ##
quarterly_plot = plot_grid(cj_quarters_plot, cc_quarters_plot, labels=c("A", "B"), nrow = 2, align = "v")
# Save figure to output directory
ggsave("Figure6.svg", plot=quarterly_plot, device="svg", width=210, height=148, units="mm", dpi=300)

### Quarterly attribution table for C. jejuni ###
# Reshape the long form table
cj_quarters_mean_table = dcast(setDT(cj_quarters_mean), Quarter ~ site, value.var=c(as.character(unique(cj_overall_summary$Source))))
# Shorten the suffixes for ease of reading
names(cj_quarters_mean_table) = gsub(x = names(cj_quarters_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cj_quarters_mean_table) = gsub(x = names(cj_quarters_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
# Reorder columns so that sites are grouped
cj_quarters_col_order = order(grepl("^.*OXC.*$", colnames(cj_quarters_mean_table)))
cj_quarters_mean_table = cj_quarters_mean_table[, ..cj_quarters_col_order]

### Quarterly attribution table for C. coli ###
cc_quarters_mean_table = dcast(setDT(cc_quarters_mean), Quarter ~ site, value.var=c(as.character(unique(cc_overall_summary$Source))))
names(cc_quarters_mean_table) = gsub(x = names(cc_quarters_mean_table), pattern = "_Newcastle", replacement = " (NWC)")
names(cc_quarters_mean_table) = gsub(x = names(cc_quarters_mean_table), pattern = "_Oxfordshire", replacement = " (OXC)")
cc_quarters_col_order = order(grepl("^.*OXC.*$", colnames(cc_quarters_mean_table)))
cc_quarters_mean_table = cc_quarters_mean_table[, ..cc_quarters_col_order]

### Quarterly counts for C. jejuni ###
cj_quarters_count = cj_quarters %>%
    group_by(site, Quarter) %>%
    summarise(total.count=n())
# Add species column and rename columns
cj_quarters_count$Species = "C. jejuni"
colnames(cj_quarters_count) <- c("Site", "Quarter", "Count", "Species")

### Quarterly counts for C. coli ###
cc_quarters_count = cc_quarters %>%
    group_by(site, Quarter) %>%
    summarise(total.count=n())
cc_quarters_count$Species = "C. coli"
colnames(cc_quarters_count) <- c("Site", "Quarter", "Count", "Species")

# Combined quarterly counts for plotting
overall_quarters_count = union(cj_quarters_count, cc_quarters_count)

# Generate quarterly counts plot
overall_quarters_count_plot = ggplot(data=overall_quarters_count, aes(x = make_date(as.numeric(Quarter)), y = Count, group = Site)) + geom_line(aes(linetype=Site)) + labs(x="Time (years and calendar quarters)", y="Number of isolates") + scale_x_date(breaks=date_breaks("3 months"), expand=c(0,0), labels=format_quarters) + theme_grey(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.text = element_text(face = "italic")) + facet_grid(Species ~ ., scales = "free")

## Combined annual and quarterly counts plot ##
overall_counts_combined = plot_grid(overall_years_count_plot, overall_quarters_count_plot, labels=c("A", "B"), nrow = 2, align = "v")
# Save plot
ggsave("Figure4.svg", plot=overall_counts_combined, device="svg", width=210, height=148, units="mm", dpi=300)

## Combined quarterly counts table ##
cj_quarters_count_report <- as.data.frame(cj_quarters_count) %>%
	select(-matches('Species'))
cj_quarters_count_report = dcast(cj_quarters_count_report, Quarter ~ Site, value.var="Count")
cc_quarters_count_report <- as.data.frame(cc_quarters_count) %>%
	select(-matches('Species'))
cc_quarters_count_report = dcast(cc_quarters_count_report, Quarter ~ Site, value.var="Count")
overall_quarters_count_table = merge(cj_quarters_count_report, cc_quarters_count_report, by.x="Quarter", by.y="Quarter", all=TRUE, suffixes = c(" (Cj)"," (Cc)"))

###############################################################################
##### REPORT GENERATION #####
###############################################################################


##### CREATE DOCUMENT AND TITLE #####
doc = docx(title="FSA_Report_Skeleton")
doc = addTitle(doc , 'FS101013: Source attribution of campylobacteriosis isolates from Newcastle/North Tyneside and Oxfordshire', level=1)


##### INTRODUCTION #####
doc = addTitle(doc, 'Introduction', level=2)
doc = addParagraph(doc, 'This is an automated report summarizing the population genetic assignment of Campylobacter isolates from human disease cases in Newcastle/North Tyneside and Oxfordshire, UK. Isolates were assigned to putative sources based on population genetic analysis using STRUCTURE (1) or iSource (2) software, as selected by the user. The reporting system was developed using funding from the Food Standards Agency (FS101013) and the NIHR Health Protection Research Unit in Gastrointestinal Infections.', stylename = 'Normal')


##### METHODS #####
doc = addTitle(doc , 'Methods', level=2)
### Describe datasets ###
doc = addTitle(doc, 'Datasets', level=3)
doc = addParagraph(doc, 'Attribution analyses were carried out using data stored in the PubMLST database (https://pubmlst.org/ campylobacter/) (3).', stylename = 'Normal')

## Reference sets ##
doc = addTitle(doc, 'Reference isolates', level=4)
doc = addParagraph(doc, 'Validated C. jejuni (n = 7715) and C. coli (n = 3521) reference sets were exported from the PubMLST database in July 2017 and remain unchanged (Jansen van Rensburg et al., In preparation).', stylename = 'Normal')

## Human disease sets ##
doc = addTitle(doc, 'Human disease isolates', level=4)
# Print simpler statement if no missing data detected
if (((cj_nwc_missing_dates + cj_oxc_missing_dates)==0)&((cc_nwc_missing_dates + cc_oxc_missing_dates)==0)) {
doc = addParagraph(doc, sprintf('A total of %s C. jejuni and %s C. coli isolates, collected between %s and %s in Newcastle/North Tyneside and Oxfordshire, were attributed to animal and/or environmental sources.', no_ancs_cj, no_ancs_cc, min_date, max_date), stylename = 'Normal')
} else {
doc = addParagraph(doc, sprintf('A total of %s C. jejuni and %s C. coli isolates, collected between %s and %s in Newcastle/North Tyneside and Oxfordshire, were attributed to animal and/or environmental sources; however, isolates without dates of isolation/laboratory receipt dates were excluded from all date-based analyses presented in this report. Following data cleaning, %s and %s C. jejuni from Newcastle/North Tyneside and Oxfordshire, respectively, were excluded, as were %s and %s C. coli from Newcastle/North Tyneside and Oxfordshire.', no_ancs_cj, no_ancs_cc, min_date, max_date, cj_nwc_missing_dates, cj_oxc_missing_dates, cc_nwc_missing_dates, cc_oxc_missing_dates), stylename = 'Normal')
}

### Describe attribution ###
doc = addTitle(doc, 'Source attribution', level=3)
doc = addParagraph(doc, 'Isolates were assigned to putative host sources using STRUCTURE or iSource, based on analysis of MLST data. The algorithms were run separately for C. jejuni and C. coli. For STRUCTURE analyses, the no-admixture model was used and the program was run using a burn-in period of 1,000 cycles followed by 10,000 iterations. For iSource, the Asymmetric Island model was used and the program was run for 10,000 iterations without thinning, using a symmetric Dirichlet prior.', stylename = 'Normal')
doc = addParagraph(doc, 'The probabilities of attribution to each source are summed as an arithmetic mean across the cases of human infection (Figures 1 and 2), presented for individual isolates (Figure 3), and summed over time (Figure 5 onwards).', stylename = 'Normal')
doc = addParagraph(doc, 'The attribution carries the assumption that all isolates came from one of the sources in the analysis. Isolates from sources not represented will be assigned to sources present in the reference sets according to their genetic similarity. The results presented do not include any adjustment for bias that may occur in population genetic attribution with the included reference isolates. Results should be considered in the light of the results of the reference set validation study (Cody et al., In preparation; Jansen van Rensburg et al., In preparation).', stylename = 'Normal')

doc <- addPageBreak(doc)


##### RESULTS #####
doc = addTitle(doc, 'Results', level=2)

### Overall summary ###
doc = addTitle(doc, 'Overall summary', level=3)
doc = addParagraph(doc, 'Figures 1 and 2 show the overall and site-by-site proportions attributed to each source in the reference set, respectively. Figure 3 shows the probabilistic assignments of individual isolates. Tabulated data for Figures 1 and 2 are provided in the Appendices.', stylename = 'Normal')
doc = addParagraph(doc, 'Tabulated data for Figures 1 and 2 are provided in the Appendices.', stylename = 'Normal')

# Overall summary plot
doc = addPlot(doc , fun=print, x=overall_plot, width=6, height=4)
doc = addParagraph(doc, sprintf('Figure 1. Estimated proportion of human disease isolates attributed to putative sources. Probabilistic assignment of (A) %s C. jejuni and (B) %s C. coli human disease isolates collected between %s and %s in Newcastle/North Tyneside and Oxfordshire.', no_ancs_cj, no_ancs_cc, min_date, max_date), stylename='Normal')

# Overall site-by-site plot
doc = addPlot(doc, fun=print, x=overall_sites_plot, width=6, height=4)
doc = addParagraph(doc, sprintf('Figure 2. Estimated proportion of human disease isolates attributed to putative sources. Probabilistic assignment of (A) C. jejuni collected in Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s) between %s and %s, and (B) C. coli collected in Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s) between %s and %s.', all_cj_nwc, all_cj_oxc, min_date_cj, max_date_cj, all_cc_nwc, all_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')

# Overall individual ancestries plot
doc = addPlot(doc , fun=print, x=overall_ind_ancs_plot, width=6.5, height=6)
doc = addParagraph(doc, sprintf('Figure 3. Source probabilities for individual human disease isolates. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Isolates are represented as vertical bars coloured according to the estimated probability for each source, as shown in the legends. Isolates are ordered horizontally to aid visualisation, first by most likely source and then by decreasing probability within each source.', all_cj_nwc, all_cj_oxc, all_cc_nwc, all_cc_oxc), stylename='Normal')

doc <- addPageBreak(doc)

### Isolate counts ###
doc = addTitle(doc, 'Breakdown of isolates over time', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')
# Isolate count over years and quarters plot
doc = addPlot(doc , fun=print, x=overall_counts_combined, width=6.5, height=7)
doc = addParagraph(doc, 'Figure 4. Number of isolates collected in Newcastle/North Tyneside and Oxfordshire per year (A), and per year and calendar quarter (B).', stylename='Normal')

doc <- addPageBreak(doc)

### Annual breakdown ###
doc = addTitle(doc, 'Annual attribution breakdown', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')

# Annual attribution plot
doc = addPlot(doc , fun=print, x=yearly_plot, width=6.5, height=6.5)
doc = addParagraph(doc, sprintf('Figure 5. Estimated proportion of human disease isolates attributed to putative sources per year. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Bars are ordered from major (bottom) to minor (top) sources based on the overall proportions shown in Figure 1.', dated_cj_nwc, dated_cj_oxc, dated_cc_nwc, dated_cc_oxc), stylename='Normal')

doc <- addPageBreak(doc)

### Quarterly breakdown ###
doc = addTitle(doc, 'Quarterly breakdown', level=3)
doc = addParagraph(doc, 'Tabulated data are provided in the Appendices.', stylename = 'Normal')

# Quarterly attribution plot
doc = addPlot(doc , fun=print, x=quarterly_plot, width=6.5, height=7.5)
doc = addParagraph(doc, sprintf('Figure 6. Estimated proportion of human disease isolates attributed to putative sources per calendar quarter. Probabilistic assignment of (A) C. jejuni isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s), and (B) C. coli isolates from Newcastle/North Tyneside (n = %s) and Oxfordshire (n = %s). Bars are ordered from major (bottom) to minor (top) sources based on the overall proportions shown in Figure 1.', dated_cj_nwc, dated_cj_oxc, dated_cc_nwc, dated_cc_oxc), stylename='Normal')

doc <- addPageBreak(doc)


##### REFERENCES #####
doc = addTitle(doc, 'References', level=2)
doc = addParagraph(doc, '1. Pritchard, J.K., Stephens, M. and Donnelly, P., 2000. Inference of population structure using multilocus genotype data. Genetics, 155(2), pp.945-959.', stylename = 'Normal')
doc = addParagraph(doc, '2. Wilson, D.J., Gabriel, E., Leatherbarrow, A.J., Cheesbrough, J., Gee, S., Bolton, E., Fox, A., Fearnhead, P., Hart, C.A. and Diggle, P.J., 2008. Tracing the source of campylobacteriosis. PLoS Genetics, 4(9), p.e1000203.', stylename = 'Normal')
doc = addParagraph(doc, '3. Jolley, K.A. and Maiden, M.C., 2010. BIGSdb: scalable analysis of bacterial genome variation at the population level. BMC bioinformatics, 11(1), p.595.', stylename = 'Normal')

doc <- addPageBreak(doc)


##### APPENDICES #####
doc = addTitle(doc, 'Appendices', level=2)

### Overall summary ###
# Overall summary table
doc = addTitle(doc, 'Overall summary', level=3)
doc = addParagraph(doc, sprintf('Table A1. Attribution to putative sources of %s C. jejuni and %s C. coli human disease isolates collected between %s and %s in Newcastle/North Tyneside and Oxfordshire.', no_cj, no_cc, min_date, max_date), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_table))
doc = addParagraph(doc, '\r\n', stylename=)

# Overall site-by-site table
doc = addParagraph(doc, sprintf('Table A2. Attribution to putative sources of C. jejuni (Cj) and C. coli (Cc) human disease isolates from Newcastle/North Tyneside (n = %s; n = %s) and Oxfordshire (n = %s; n = %s).', all_cj_nwc, all_cc_nwc, all_cj_oxc, all_cc_oxc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(sbs_table))
doc = addParagraph(doc, '\r\n', stylename=)

doc <- addPageBreak(doc)

### Isolate counts ###
doc = addTitle(doc, 'Breakdown of isolates over time', level=3)
# Breakdown of number of isolates per year
doc = addParagraph(doc, sprintf('Table A3. Number of C. jejuni (Cj) and C. coli (Cc) human disease isolates collected in Newcastle/North Tyneside and Oxfordshire per year between %s and %s.', min_date, max_date), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_years_count_table))
doc = addParagraph(doc, '\r\n', stylename=)

# Breakdown of number of isolates per quarter
# Convert quarters into readable format that matches plots
overall_quarters_count_table$Quarter = format_quarters(make_date(as.numeric(overall_quarters_count_table$Quarter)))
doc = addParagraph(doc, 'Table A4. Number of C. jejuni (Cj) and C. coli (Cc) human disease isolates per quarter', stylename='Normal')
doc = addFlexTable(doc, vanilla.table(overall_quarters_count_table))
doc = addParagraph(doc, '\r\n', stylename=)

doc <- addPageBreak(doc)

### Annual breakdown ###
doc = addTitle(doc, 'Annual breakdown', level=3)
# Breakdown of attribution per year
# C. jejuni
# Get significant figures
cj_years_mean_table <- cj_years_mean_table %>%
	mutate_if(is.numeric, funs(signif(., 3)))
# Display table
doc = addParagraph(doc, sprintf('Table A5. Proportion of C. jejuni isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per year between %s and %s.', dated_cj_nwc, dated_cj_oxc, min_date_cj, max_date_cj), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cj_years_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)

# C. coli
# Get significant figures
cc_years_mean_table <- cc_years_mean_table %>%
	mutate_if(is.numeric, funs(signif(., 3)))
# Display table
doc = addParagraph(doc, sprintf('Table A6. Proportion of C. coli isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per year between %s and %s.', dated_cc_nwc, dated_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cc_years_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)

doc <- addPageBreak(doc)

### Quarterly breakdown ###
doc = addTitle(doc, 'Quarterly breakdown', level=3)
# Breakdown of attribution per quarter
# C. jejuni
# Get significant figures
cj_quarters_mean_table <- cj_quarters_mean_table %>%
	mutate_if(is.numeric, funs(signif(., 3)))
# Convert quarters into readable format that matches plots
cj_quarters_mean_table$Quarter = format_quarters(make_date(as.numeric(cj_quarters_mean_table$Quarter)))
# Display table
doc = addParagraph(doc, sprintf('Table A7. Proportion of C. jejuni isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per calendar quarter between %s and %s.', dated_cj_nwc, dated_cj_oxc, min_date_cj, max_date_cj), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cj_quarters_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)

# C. coli
# Get significant figures
cc_quarters_mean_table <- cc_quarters_mean_table %>%
	mutate_if(is.numeric, funs(signif(., 3)))
# Convert quarters into readable format that matches plots
cc_quarters_mean_table$Quarter = format_quarters(make_date(as.numeric(cc_quarters_mean_table$Quarter)))
# Display table
doc = addParagraph(doc, sprintf('Table A8. Proportion of C. coli isolates from Newcastle/North Tyneside (NWC) (n = %s) and Oxfordshire (OXC) (n = %s) attributed to putative sources per calendar quarter between %s and %s.', dated_cc_nwc, dated_cc_oxc, min_date_cc, max_date_cc), stylename='Normal')
doc = addFlexTable(doc, vanilla.table(cc_quarters_mean_table))
doc = addParagraph(doc, '\r\n', stylename=)

writeDoc(doc, "FSA_Report_Skeleton.docx")