MultiSTEP_analysis.Rmd

---
title: "MultiSTEP FIX project analysis"
author: "Nicholas Popp"
date: "`r Sys.Date()`"
output: pdf_document
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(readr.show_col_types = FALSE)
```

```{r install and load libraries}

## install and load required packages

## knitr 1.41 for making files
if (!require(knitr)) install.packages('knitr')
library(knitr)

## here 1.2.0 for directory management
if (!require(here)) install.packages('here')
library(here)

## broom 1.2.0 for tidy fitting
if (!require(broom)) install.packages('broom')
library(broom)

## furrr for parallelizing
if (!require(furrr)) install.packages('furrr')
library(furrr)

## hash for creating hash table
if (!require(hash)) install.packages('hash')
library(hash)

## scales 1.1.1 for scientific notation
if (!require(scales)) install.packages('scales')
library(scales)

## tidyverse 1.3.1 for ggplot, dplyr, data manipulation
## absolutely required
if (!require(tidyverse)) install.packages('tidyverse')
library(tidyverse)

## paletteer 1.4.0 for color palettes
if (!require(paletteer)) install.packages('paletteer')
library(paletteer)

## ggpubr 0.4.0 for correlation stats
if (!require(ggpubr)) install.packages('ggpubr')
library(ggpubr)

## ggrastr 1.0.0 for reducing plot size when >1000 points
if (!require(ggrastr)) install.packages('ggrastr')
library(ggrastr)

## janitor 2.1.2 for cleaning data
if (!require(janitor)) install.packages('janitor')
library(janitor)

## ggridges 0.5.4 for stacked density plots
if (!require(ggridges)) install.packages('ggridges')
library(ggridges)

## patchwork 1.1.1 for aligning multi-panel plots
if (!require(patchwork)) install.packages('patchwork')
library(patchwork)

## colorspace 2.1-0 for color manipulation
if (!require(colorspace)) install.packages('colorspace')
library(colorspace)

## ggrepel 0.9.1 to label points easily
if (!require(ggrepel)) install.packages('ggrepel')
library(ggrepel)

## tidymodels 1.0.0 for classification models
if (!require(tidymodels)) install.packages('tidymodels')
library(tidymodels)

## themis 1.0.0 for dealing with unbalanced class data
if (!require(themis)) install.packages('themis')
library(themis)

## ggpattern 1.0.1 for striped fills
if (!require(ggpattern)) install.packages('ggpattern')
library(ggpattern)

## ggalluvial 1.1.1 for alluvial plots
if (!require(ggalluvial)) install.packages('ggalluvial')
library(ggalluvial)

## devtools 1.1.1 for installing from github
if (!require(devtools)) install.packages('devtools')
library(devtools)

## ggsankey 0.0.99999 for sankey plots
## not installed from CRAN: fetched from GitHub with devtools::install_github()
## (devtools is loaded above; there is no install.github() function)
if (!require(ggsankey)) devtools::install_github('davidsjoberg/ggsankey')
library(ggsankey)

## ggpubr 0.5.0 for statistics
if (!require(ggpubr)) install.packages('ggpubr')
library(ggpubr)

## mcp 0.3.4 for changepoint analysis
## requires download of JAGS to work (https://sourceforge.net/projects/mcmc-jags/)
if (!require(mcp)) install.packages('mcp')
library(mcp)

## HDInterval 0.2.4 for changepoint analysis
if (!require(HDInterval)) install.packages('HDInterval')
library(HDInterval)

```

```{r Rmd file setup}

## anchor all here() paths to the location of this analysis file
## the analysis file should be one directory above all input files
here::i_am(path = "MultiSTEP_analysis.Rmd")

###############################################################################

## fix the random seed so randomized plot elements are reproducible
set.seed(seed = 627)

```

```{r plot setup}

## set the default ggplot theme for every plot in this file:
## theme_bw() at 6pt with all text forced to 6pt black, no grid lines,
## no facet strip background, and the legend centered on the right
theme_set(
  theme_bw(base_size = 6) +
    theme(
      ## text elements (all 6pt, black)
      text = element_text(size = 6, color = "black"),
      axis.text = element_text(size = 6, color = "black"),
      legend.text = element_text(size = 6, color = "black"),
      strip.text = element_text(size = 6, color = "black"),
      plot.title = element_text(size = 6, hjust = 0.5, vjust = 1, color = "black"),
      plot.tag = element_text(size = 6, color = "black"),
      plot.caption = element_text(size = 6, color = "black"),
      ## black ticks and panel outline
      axis.ticks = element_line(color = "black"),
      panel.border = element_rect(color = "black"),
      ## elements that are removed entirely
      strip.background = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      ## legend placement
      legend.position = "right",
      legend.justification = "center"
    )
)

```

```{r function setup}

## create beautiful heatmaps
##
## draws a tile heatmap with position on the x axis, substituted amino acid
## on the y axis, and average_score2 as the fill color
##
## ...: forwarded as the `data` argument of ggplot(); the data must contain
##      the columns position, var_aa, average_score2, and is_wt
##      (is_wt is compared against the string "WT")
## returns the assembled ggplot object
heatmap_plot <- function(...) {
  ggplot(data = ...,
         aes(x = position,
             y = fct_rev(var_aa),
             fill = average_score2)) +
    ## setup to use color to plot missing data
    ## each `. %>% filter()` is a function applied to the plot data, so every
    ## layer draws its own subset of the same dataframe
    ## the constant color aesthetic exists only to create a legend entry
    geom_tile(data = . %>% 
                filter(is.na(average_score2)),
              aes(color = ""),
              fill = "grey50") +
    ## plot non-NA data with grey stroke
    geom_tile(data = . %>%
                filter(is_wt != "WT"),
              color = "grey20") +
    ## fill in WT positions white (WT score, by definition)
    geom_tile(data = . %>%
                filter(is_wt == "WT"),
              fill = "white", color = "grey20") +
    ## identify WT with point in tile
    geom_point(data = . %>%
                 filter(is_wt == "WT"),
               aes(shape = ""),
               size = 0.2, color = "black") +
    ## adjust fill colors (blue = low, red = high, white = WT)
    ## breaks are given explicitly so they always pair one-to-one with the
    ## five labels instead of relying on the default break calculation
    scale_fill_distiller(palette = "RdBu", direction = -1,
                         limits = c(0, 2),
                         breaks = seq(0, 2, by = 0.5),
                         labels = c("0", "0.5", "1", "1.5", "2")) +
    scale_color_manual(values = NA) +
    ## scale y axis to remove excess space
    scale_y_discrete(expand = c(0, 0),
                     guide = guide_axis(n.dodge = 2)) +
    ## add labels
    labs(x = "Position",
         y = "Substituted amino acid") +
    ## adjust plot features
    theme(panel.border = element_rect(fill = NA, color = "black"),
          axis.text.y = element_text(hjust = 0.5))
}

###############################################################################

## function to read one ClinVar tsv file and label its rows with a year
## path: path to a tab-separated file with a header row; the path is expected
##       to contain "_<digits>-", and those digits are captured as the year
## returns a tibble with the six selected ClinVar columns plus `year`
read_clinvar_path <- function(path){

  ## load only the columns that are needed downstream
  clinvar_raw <- read_tsv(path, col_names = TRUE,
                          col_select = c("Type", "Name", "#AlleleID",
                                         "ReviewStatus",
                                         "ClinicalSignificance", "GeneSymbol"))

  ## remember which file every row came from
  clinvar_tagged <- mutate(clinvar_raw, source_path = path)

  ## replace the path column with the digits found between "_" and "-"
  extract(clinvar_tagged,
          col = source_path,
          into = "year",
          regex = ".*_([0-9]+)-.*")

}

###############################################################################
  
## function to read one flow cytometry csv file and record where it came from
## path: path to a comma-separated file with a header row
## returns the file contents as a tibble with an added `source_path` column
read_flow_path <- function(path){

  ## read in csv file
  flow_events <- read_csv(path, col_names = TRUE)

  ## keep the file path alongside every row
  mutate(flow_events, source_path = path)

}

###############################################################################

## function to compare DNA and protein strings to identify variants
## use in conjunction with rowwise() and mutate() on a dataframe of sequences
## to call variants from sequence
##
## nt1: reference nucleotide sequence (single string)
## nt2: query nucleotide sequence (single string)
## translation uses the global hash_codon_table (codon -> amino acid)
## returns a named character vector with elements diff_aa, diff_nt, and
## diff_codon describing the differences of nt2 relative to nt1
## not vectorized: the if () branches evaluate one pair of sequences per call
variant_caller <- function(nt1, nt2) {
  
  ## length difference in nt (negative = insertion, positive = deletion)
  temp_length <- str_length(nt1) - str_length(nt2)

  ## count insertions
  if (temp_length < 0) {
    
    ## take absolute value of length to prevent negative numbers in table
    temp_length <- abs(temp_length)
    
    ## output insertions with nt length
    diff_aa <- paste0("insertion: length ", temp_length, " nt")
    diff_nt <- paste0("insertion: length ", temp_length, " nt")
    diff_codon <- paste0("insertion: length ", temp_length, " nt")
    
  ## count deletions, not including 3 nt deletions
  ## scalar && is used because if () takes a single TRUE/FALSE
  } else if (temp_length > 0 && temp_length != 3) {
    
    ## output deletions with nt length
    diff_aa <- paste0("deletion: length ", temp_length, " nt")
    diff_nt <- paste0("deletion: length ", temp_length, " nt")
    diff_codon <- paste0("deletion: length ", temp_length, " nt")

  ## correct length and codon deletion analysis
  } else {
    
    ## turn strings into vector of NT
    nt1vec <- unlist(strsplit(nt1, ""))
    nt2vec <- unlist(strsplit(nt2, ""))
  
    ## combine strings into 3 NT codons
    ## each logical pattern is recycled to pick every 1st, 2nd, or 3rd base
    nt1codvec <- paste0(nt1vec[c(TRUE, FALSE, FALSE)],
                        nt1vec[c(FALSE, TRUE, FALSE)],
                        nt1vec[c(FALSE, FALSE, TRUE)])
    nt2codvec <- paste0(nt2vec[c(TRUE, FALSE, FALSE)],
                        nt2vec[c(FALSE, TRUE, FALSE)],
                        nt2vec[c(FALSE, FALSE, TRUE)])
    
    ## translate both codon vectors to amino acids via the codon hash table
    aa1vec <- unlist(mget(nt1codvec, hash_codon_table@.xData))
    aa2vec <- unlist(mget(nt2codvec, hash_codon_table@.xData))
    
    ## deletion analysis to find position
    if (temp_length == 3) {
      
      ## append NNN to end to create same length vector
      nt2codvec_temp <- append(nt2codvec, "NNN")
      
      ## compare as vector, find first missing position
      missing_pos <- seq_along(nt1codvec)[nt1codvec != nt2codvec_temp][1]
      
      ## add "NNN" at first missing position and re-compare
      ## append adds after position, so have to subtract 1
      nt2codvec_NNN <- append(nt2codvec, "NNN", after = missing_pos - 1)
      
      ## find all positions that still differ after the placeholder is added
      new_missing_pos <- seq_along(nt1codvec)[nt1codvec != nt2codvec_NNN]
      
      ## compare missing positions
      match_missing <- missing_pos == new_missing_pos
      
      ## check for codon deletions + substitutions
      ## (more than one differing codon remains after placing the placeholder)
      if (length(match_missing) > 1) {
        
        ## assign values for complex deletions
        diff_aa <- paste0("complex deletion: length ", temp_length, " nt")
        diff_nt <- paste0("complex deletion: length ", temp_length, " nt")
        diff_codon <- paste0("complex deletion: length ", temp_length, " nt")
        
      ## if codon deletion is in frame, will only be same position and return TRUE
      ## if FALSE, deletion is out of frame
      } else if (match_missing) {
        
        ## label the deleted amino acid as <WT aa><position>del
        diff_aa <- paste0(aa1vec[new_missing_pos], new_missing_pos, "del",
                          collapse = ", ")
        
        ## nucleotide level only reports the deletion length
        diff_nt <- paste0("codon deletion: length ", temp_length, " nt")
        
        ## label the deleted codon as <WT codon><position>del
        diff_codon <- paste0(nt1codvec[new_missing_pos], new_missing_pos, "del")
        
      } else {
        
        ## report deletion with nt length
        diff_aa <- paste0("frameshift deletion: length ", temp_length, " nt")
        diff_nt <- paste0("frameshift deletion: length ", temp_length, " nt")
        diff_codon <- paste0("frameshift deletion: length ", temp_length, " nt")
        
      }
      
    } else {
      
      ## positions that differ at the nucleotide and codon level
      var_nt <- seq_along(nt1vec)[nt1vec != nt2vec]
      var_codon <- seq_along(nt1codvec)[nt1codvec != nt2codvec]
  
      ## if no nucleotide differs, label WT
      if (length(var_nt) == 0) {
        
        ## label as WT
        diff_aa <- "WT"
        diff_nt <- "WT"
        diff_codon <- "WT"
    
      ## else, make list of variants at nucleotide and amino acid level
      } else {
      
        ## divide nt position by 3 to make aa position
        ## use ceiling to create a round number and prevent dividing errors
        ceiling_nt <- ceiling(var_nt / 3)

        ## only retain unique aa position values
        ceiling_nt <- unique(ceiling_nt)

        ## create list of different amino acids, collapse with , separator
        ## every changed codon is listed, so synonymous changes appear with
        ## the same amino acid on both sides of the position
        diff_aa <- paste0(aa1vec[ceiling_nt], ceiling_nt, aa2vec[ceiling_nt],
                          collapse = ", ")
      
        ## create list of different nucleotides, collapse with , separator
        diff_nt <- paste0(nt1vec[var_nt], var_nt, nt2vec[var_nt],
                          collapse = ", ")
        
        ## create list of different codons
        diff_codon <- paste0(nt1codvec[var_codon], var_codon, nt2codvec[var_codon],
                             collapse = ", ")
      
      }
      
    }
    
  }
  
  ## concatenate different nt and aa together
  diff_all <- c(diff_aa, diff_nt, diff_codon)
      
  ## add element names for unnesting later 
  names(diff_all) <- c("diff_aa", "diff_nt", "diff_codon")
  
  ## return named character vector
  return(diff_all)
  
}

###############################################################################

## function to call pathogenicity according to ACMG criteria (Richards, 2015)
## call this function within mutate() when modifying a dataframe
##
## very_strong, strong, moderate, supporting: number of evidence criteria met
##   at each strength level
## returns "P", "LP", or "VUS"; the first matching rule wins
## NOTE(review): the string defaults look like column-name placeholders --
## confirm that callers always pass the count columns explicitly
reclassify_vars <- function(very_strong = "very_strong",
                            strong = "strong",
                            moderate = "moderate",
                            supporting = "supporting") {
  
  ## comparisons that are shared by several rules
  has_very_strong <- very_strong == 1
  one_strong <- strong == 1
  
  ## determine resulting classification
  case_when(
    ## pathogenic: one very strong plus additional evidence
    has_very_strong & strong >= 1 ~ "P",
    has_very_strong & moderate >= 2 ~ "P",
    has_very_strong & moderate == 1 & supporting == 1 ~ "P",
    has_very_strong & supporting >= 2 ~ "P",
    ## pathogenic: two or more strong
    strong >= 2 ~ "P",
    ## pathogenic: one strong plus moderate/supporting evidence
    one_strong & moderate >= 3 ~ "P",
    one_strong & moderate == 2 & supporting >= 2 ~ "P",
    one_strong & moderate == 1 & supporting >= 4 ~ "P",
    ## likely pathogenic
    has_very_strong & moderate == 1 ~ "LP",
    one_strong & moderate > 0 & moderate <= 2 ~ "LP",
    one_strong & supporting >= 2 ~ "LP",
    moderate >= 3 ~ "LP",
    moderate == 2 & supporting >= 2 ~ "LP",
    moderate == 1 & supporting >= 4 ~ "LP",
    ## VUS (all other combinations)
    TRUE ~ "VUS")
  
}

## find distance between query position and list of feature positions
## x: query position
## y: vector of feature positions
## returns one space-separated string: query, closest feature, distance
find_distance <- function(x, y){
  
  ## absolute gap between the query and every feature
  gaps <- abs(x - y)
  
  ## feature with the smallest gap (the first one if several are tied)
  nearest <- y[which.min(gaps)]
  
  ## query, closest feature, and their distance as a single string
  paste(x, nearest, abs(x - nearest))
  
}

```

```{r reference information}

## WT gene sequence in nucleotides (does not include engineered linker/tag/TMD)
wt_FIX_nt <- "ATGCAGCGCGTGAACATGATCATGGCAGAATCACCAGGCCTCATCACCATCTGCCTTTTAGGATATCTACTCAGTGCTGAATGTACAGTTTTTCTTGATCATGAAAACGCCAACAAAATTCTGAATCGGCCAAAGAGGTATAATTCAGGTAAATTGGAAGAGTTTGTTCAAGGGAACCTTGAGAGAGAATGTATGGAAGAAAAGTGTAGTTTTGAAGAAGCACGAGAAGTTTTTGAAAACACTGAAAGAACAACTGAATTTTGGAAGCAGTATGTTGATGGAGATCAGTGTGAGTCCAATCCATGTTTAAATGGCGGCAGTTGCAAGGATGACATTAATTCCTATGAATGTTGGTGTCCCTTTGGATTTGAAGGAAAGAACTGTGAATTAGATGTAACATGTAACATTAAGAATGGCAGATGCGAGCAGTTTTGTAAAAATAGTGCTGATAACAAGGTGGTTTGCTCCTGTACTGAGGGATATCGACTTGCAGAAAACCAGAAGTCCTGTGAACCAGCAGTGCCATTTCCATGTGGAAGAGTTTCTGTTTCACAAACTTCTAAGCTCACCCGTGCTGAGACTGTTTTTCCTGATGTGGACTATGTAAATTCTACTGAAGCTGAAACCATTTTGGATAACATCACTCAAAGCACCCAATCATTTAATGACTTCACTCGGGTTGTTGGTGGAGAAGATGCCAAACCAGGTCAATTCCCTTGGCAGGTTGTTTTGAATGGTAAAGTTGATGCATTCTGTGGAGGCTCTATCGTTAATGAAAAATGGATTGTAACTGCTGCCCACTGTGTTGAAACTGGTGTTAAAATTACAGTTGTCGCAGGTGAACATAATATTGAGGAGACAGAACATACAGAGCAAAAGCGAAATGTGATTCGAATTATTCCTCACCACAACTACAATGCAGCTATTAATAAGTACAACCATGACATTGCCCTTCTGGAACTGGACGAACCCTTAGTGCTAAACAGCTACGTTACACCTATTTGCATTGCTGACAAGGAATACACGAACATCTTCCTCAAATTTGGATCTGGCTATGTAAGTGGCTGGGGAAGAGTCTTCCACAAAGGGAGATCAGCTTTAGTTCTTCAGTACCTTAGAGTTCCACTTGTTGACCGAGCCACATGTCTTCGATCTACAAAGTTCACCATCTATAACAACATGTTCTGTGCTGGCTTCCATGAAGGAGGTAGAGATTCATGTCAAGGAGATAGTGGGGGACCCCATGTTACTGAAGTGGAAGGGACCAGTTTCTTAACTGGAATTATTAGCTGGGGTGAAGAGTGTGCAATGAAAGGCAAATATGGAATATATACCAAGGTATCCCGGTATGTCAACTGGATTAAGGAAAAAACAAAGCTCACT"

## WT gene sequence in amino acids (does not include engineered linker/tag/TMD)
wt_FIX <- "MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKRYNSGKLEEFVQGNLERECMEEKCSFEEAREVFENTERTTEFWKQYVDGDQCESNPCLNGGSCKDDINSYECWCPFGFEGKNCELDVTCNIKNGRCEQFCKNSADNKVVCSCTEGYRLAENQKSCEPAVPFPCGRVSVSQTSKLTRAETVFPDVDYVNSTEAETILDNITQSTQSFNDFTRVVGGEDAKPGQFPWQVVLNGKVDAFCGGSIVNEKWIVTAAHCVETGVKITVVAGEHNIEETEHTEQKRNVIRIIPHHNYNAAINKYNHDIALLELDEPLVLNSYVTPICIADKEYTNIFLKFGSGYVSGWGRVFHKGRSALVLQYLRVPLVDRATCLRSTKFTIYNNMFCAGFHEGGRDSCQGDSGGPHVTEVEGTSFLTGIISWGEECAMKGKYGIYTKVSRYVNWIKEKTKLT"

###############################################################################

## convert WT FIX sequence to dataframe to use as dictionary
## one row per residue: the amino acid and its 1-based position
wt_FIX_aa <- tibble(
  wt_aa = str_extract_all(wt_FIX, boundary("character"))[[1]],
  position = seq(from = 1, to = nchar(wt_FIX), by = 1)
)

## convert WT sequence dataframe to hash table for faster lookup
## (position -> WT amino acid)
hash_wt_FIX <- with(wt_FIX_aa, hash(keys = position, values = wt_aa))

###############################################################################

## import codon conversion table
codon_table <- here("inputs", "codon_table", "codon_table.csv") %>%
  read_csv()

## convert codon table to hash table for faster lookup (codon -> amino acid)
hash_codon_table <- with(codon_table, hash(keys = codon, values = aa))

###############################################################################

## import amino acid abbreviation table
aa_table <- here("inputs", "codon_table", "aa_conversion.csv") %>%
  read_csv()

## convert amino acid abbreviation table to hash tables for faster lookup
    ## one letter to three letter abbreviation
hash_aa1_to_aa3 <- with(aa_table, hash(keys = aa1, values = aa3))

    ## three letter to one letter abbreviation
hash_aa3_to_aa1 <- with(aa_table, hash(keys = aa3, values = aa1))

###############################################################################

## read in protein domain table for FIX
domains <- here("inputs", "protein_information", "domains.csv") %>%
  read_csv() %>%
  ## make full domain names a factor so they plot in the order listed here
  mutate(domain = factor(domain,
                         levels = c("Signal peptide", "Propeptide",
                                    "Gla", "EGF1", "EGF2", 
                                    "Activation peptide",
                                    "Serine protease"))) %>%
  ## same for the abbreviated domain names
  mutate(domain_short = factor(domain_short,
                               levels = c("SP", "PP", "Gla",
                                          "EGF1", "EGF2", "AP",
                                          "SPD")))

###############################################################################

## create antibody table for easy labeling
## one row per antibody: bare id, id with "ab" prefix, and full plot label
antibody_table <- tribble(
  ~antibody, ~antibody_nonnum, ~antibody_label,
  "001",     "ab001",          "Carboxylation-sensitive FIX-specific antibody",
  "3570",    "ab3570",         "Carboxylation-sensitive Gla-motif antibody",
  "102",     "ab102",          "Heavy chain antibody",
  "124",     "ab124",          "Light chain antibody",
  "strep",   "abstrep",        "Strep II tag antibody"
) %>%
  ## make two-line version for some plots (line break after "sensitive")
  mutate(antibody_label2 = gsub("sensitive ", "sensitive\n", antibody_label))

###############################################################################

## positions covered by each of the three library tiles
tile1 <- seq(1, 164, by = 1)
tile2 <- seq(146, 318, by = 1)
tile3 <- seq(299, 461, by = 1)

## positions shared by neighboring tiles, if any
overlap12 <- intersect(tile1, tile2)
overlap23 <- intersect(tile2, tile3)

###############################################################################

## create easy reference variables for RefSeq/Ensembl/UniProt accessions for FIX
## plain string constants; nothing is looked up or validated here
## RefSeq accessions: cDNA, protein, and genomic DNA records
refseq_cDNA <- "NM_000133.4"
refseq_protein <- "NP_000124.1"
refseq_gDNA <- "NC_000023.11"
## Ensembl transcript and protein identifiers
ensembl_transcript_id <- "ENST00000218099"
ensembl_protein_id <- "ENSP00000218099"
## UniProt accession
uniprot_id <- "P00740"

```

```{r analyze PacBio sequencing data}

## import PacBio data 
## column names are supplied here, so the first line of the file is read as data
all_barcodes <- read_table(here("inputs", "pacbio",
                                "round1-2_FIX_allbc_noCterm_barcode_variant_map_cutoff3.txt"),
                           col_names = c("barcode", "sequence"))

###############################################################################

## call variants
all_barcodes_varcalled <- all_barcodes %>%
  ## variant_caller() is not vectorized (its if () branches evaluate a single
  ## pair of sequences), so it has to be applied one row at a time
  rowwise() %>%
  ## annotate variants
  mutate(var = list(variant_caller(nt1 = wt_FIX_nt, nt2 = sequence))) %>%
  ## drop the rowwise grouping so that all later steps run vectorized
  ungroup() %>%
  ## unlist variants for aa and nt
  unnest_wider(col = var) %>%
  ## alter stop codon from * to X for easier analysis later
  mutate(diff_aa = gsub("\\*", "X", diff_aa)) %>%
  ## remove extraneous columns
  select(-sequence)

###############################################################################

## catalog variant types
## adds var_count, wt_aa, var_aa, position, and var_type for every barcode
all_barcodes_varcalled_type <- all_barcodes_varcalled %>%
  ## count variants: indel labels (they contain ":"), WT, and "not seen" are
  ## 0; anything else is the number of comma-separated entries
  mutate(var_count = case_when(
    str_detect(diff_aa, ":") | diff_aa %in% c("WT", "not seen") ~ 0,
    TRUE ~ str_count(diff_aa, ",") + 1)) %>%
  ## split single variants into WT aa, variant aa, and position
  ## "XXX", "YYY", and 0 are placeholders for rows that are not single variants
  mutate(wt_aa = case_when(var_count == 1 ~ str_sub(diff_aa, 1L, 1L),
                           diff_aa == "WT" ~ "WT",
                           TRUE ~ "XXX"),
         ## codon deletions keep the trailing "del", others the last letter
         var_aa = case_when(var_count == 1 & grepl("del", diff_aa) ~
                              str_sub(diff_aa, -3L, -1L),
                            var_count == 1 ~ str_sub(diff_aa, -1L, -1L),
                            diff_aa == "WT" ~ "WT",
                            TRUE ~ "YYY"),
         position = case_when(var_count == 1 ~
                                as.numeric(str_extract(diff_aa, "[0-9]+")),
                              TRUE ~ 0)) %>%
  ## aggregate >5 variants
  mutate(var_count = case_when(var_count > 5 ~ "6+",
                               TRUE ~ as.character(var_count)),
         ## classify variants (first matching rule wins)
         var_type = case_when(diff_aa == "not seen" ~ "not seen or filtered",
                              diff_aa == "WT" ~ "0 - WT",
                              str_detect(diff_aa, ":") ~ "indel",
                              var_count == "1" & var_aa == "del" ~ "1 - codon deletion",
                              var_count == "1" & wt_aa == var_aa ~ "1 - synonymous",
                              var_count == "1" & var_aa == "X" ~ "1 - nonsense",
                              var_count == "1" ~ "1 - missense",
                              TRUE ~ var_count))

###############################################################################

## isolate only single variant barcodes
all_barcodes_varcalled_single <- all_barcodes_varcalled_type %>%
  ## keep WT and single variants, then drop single codon deletions
  filter(grepl("1 - ", var_type) | diff_aa == "WT",
         !grepl("del", var_type)) %>%
  ## remove unnecessary columns
  select(barcode, diff_aa, diff_nt, diff_codon) %>%
  ## change position of WT as 0, extract position otherwise
  mutate(position = if_else(diff_aa == "WT",
                            0,
                            as.numeric(str_extract(diff_aa, "[0-9]+"))),
         ## identify WT and variant amino acids (first and last character of
         ## diff_aa); both are NA for WT barcodes
         wt_aa = if_else(diff_aa != "WT",
                         str_sub(diff_aa, 1L, 1L),
                         NA_character_),
         var_aa = if_else(diff_aa != "WT",
                          str_sub(diff_aa, -1L, -1L),
                          NA_character_))

###############################################################################

## write barcode map to csv
all_barcodes_varcalled_single %>%
  write_csv(here("outputs", "barcode_map",
                 "MultiSTEP_F9_barcode_variant_map_singlevars.csv"))

###############################################################################

## calculate statistics on unique values in PacBio libraries
## per variant type: number of distinct values in each diff_* column and
## number of distinct barcodes, with row and column totals appended
pacbio_variants <- all_barcodes_varcalled_type %>%
  group_by(var_type) %>%
  summarise(across(.cols = c(contains("diff"), "barcode"),
                   .fns = function(values) n_distinct(values),
                   .names = "n_{.col}")) %>%
  adorn_totals(where = c("row", "col"))

## calculate number of single codon variants in FIX scored by PacBio
pacbio_num_codon_variants <- pacbio_variants %>%
  ## keep only single codon variants not including indels
  filter(grepl("1", var_type),
         !grepl("deletion", var_type)) %>%
  ## revert to non-tabyl dataframe
  untabyl() %>%
  ## get totals 
  adorn_totals("row")

```

```{r analyzing Human Protein Atlas data}

## read in all genes in Human Protein Atlas
HPA_all <- here("inputs", "human_protein_atlas",
                "proteinatlas_f7d77999.tsv") %>%
  read_tsv() %>%
  ## clean names
  clean_names() %>%
  ## assign one localization per gene from the protein_class text:
  ## secreted takes priority over membrane, everything else is intracellular
  mutate(localization = case_when(
    grepl("Predicted secreted", protein_class) ~ "Secreted",
    grepl("Predicted membrane", protein_class) ~ "Membrane",
    TRUE ~ "Intracellular"))

###############################################################################

## count genes by localization 
HPA_secreted <- HPA_all %>% 
  count(localization) %>%
  ## fraction of all genes in each compartment, rounded to 3 digits
  mutate(frac = round(n / sum(n), digits = 3)) %>%
  ## calculate boundaries for donut plot: each segment runs from the previous
  ## cumulative fraction (ymin) to its own (ymax); mid places the label
  mutate(ymax = cumsum(frac),
         ymin = lag(ymax, default = 0),
         mid = ymax - frac / 2)

###############################################################################

## Fig. 1a: donut plot of genes by localization 
secreted_plot <- ggplot() +
  ## create one ring segment per localization with fill and black border
  geom_rect(data = HPA_secreted,
            aes(xmin = 5, xmax = 6.5,
                ymin = ymin, ymax = ymax,
                fill = localization), color = "black") +
  ## make circular
  coord_polar(theta = "y") +
  ## add labels with both number and percent of genes in cellular compartment
  geom_text(data = HPA_secreted,
            aes(label = paste0(comma(n), "\n", round(frac * 100, digits = 0), "%"),
            x = 5.75, y = mid),
            vjust = 0.5, hjust = 0.5, size = 6 / .pt) +
  ## rescale x axis and fill colors
  ## the x range below the ring (3 to 5) is left empty to form the donut hole
  scale_x_continuous(limits = c(3, 6.5)) +
  scale_fill_manual(
    values = lighten(as.list(paletteer_d("PNWColors::Sunset")[c(5, 3, 1)]),
                     amount = 0.2)) +
  ## remove all axes
  theme_void(base_size = 6) +
  ## alter figure legend position, size, and colors
  guides(fill = guide_legend(override.aes = list(size = 0.1, color = NA))) +
  ## legend.position.inside is only used when legend.position is "inside";
  ## without it the legend stays at the theme default and the coordinates
  ## are ignored
  theme(legend.position = "inside",
        legend.position.inside = c(0.5, 0.5),
        legend.title = element_blank(), 
        legend.text  = element_text(size = 6),
        legend.key.size = unit(0.7, "lines"))

## save Fig. 1a
ggsave(here("outputs", "main_fig_panels", "1a_secreted_proteins_HPA.pdf"),
       plot = secreted_plot, device = cairo_pdf,
       height = 50, width = 50, units = "mm")

###############################################################################

## create list of secreted genes for later use with ClinVar
## (single-column tibble holding the gene names of all secreted proteins)
secreted_gene_list <- select(filter(HPA_all, localization == "Secreted"),
                             gene)

```

```{r Clinvar secreted data}

## import yearly ClinVar data from December 2016 to June 2023
## and only include single nucleotide variants that pass filters
## result: one row per retained variant with GeneSymbol, year, and a
## simplified ClinicalSignificance
all_clinvar <- list.files(path = here("inputs", "ClinVar", "by_year"),
                          ## list.files() takes a regular expression, not a
                          ## shell glob: match names ending in .txt, plain or
                          ## gzip-compressed
                          pattern = "\\.txt(\\.gz)?$",
                          recursive = TRUE) %>%
  ## then read each in using read_clinvar_path, which adds a path column
  ## that can be used to isolate the year
  map_df(~read_clinvar_path(here("inputs", "ClinVar", "by_year", .))) %>%
  ## remove duplicates (hg37 and hg38)
  distinct() %>%
  ## keep only single variants
  filter(Type == "single nucleotide variant") %>%
  ## split Name into transcript and protein variant
  ## (the protein variant is the text inside the final parentheses)
  extract(col = Name, into = c("Transcript", "protein_variant"),
          regex = "(.*) \\((.*)\\)") %>%
  ## drop missing protein variants
  drop_na(protein_variant) %>%
  ## remove synonymous, nonsense
  filter(!grepl("Ter|=", protein_variant)) %>%
  ## remove variants without assertions
  filter(!grepl("no assertion|no interpretation|-", ReviewStatus)) %>%
  ## keep only B/LB/VUS/LP/P
  filter(!grepl(";|,|risk|not|other|drug|Affects|association",
                ClinicalSignificance)) %>%
  ## convert classifications
  ## conflicting and uncertain become VUS; combined "Benign/..." and
  ## "Pathogenic/..." calls are assigned to the "Likely" category
  mutate(ClinicalSignificance =
           case_when(grepl("Conflicting", ClinicalSignificance) ~ "VUS",
                     grepl("Uncertain", ClinicalSignificance) ~ "VUS",
                     grepl("Benign/", ClinicalSignificance) ~ "Likely benign",
                     grepl("Pathogenic/", ClinicalSignificance) ~ "Likely pathogenic",
                     TRUE ~ ClinicalSignificance)) %>%
  ## keep only necessary columns
  select(GeneSymbol, year, ClinicalSignificance)

###############################################################################

## count secreted gene variants by classification in each year
clinvar_secreted <- all_clinvar %>%
  ## remove nonsecreted genes (only genes present in the secreted list match)
  inner_join(secreted_gene_list, by = c("GeneSymbol" = "gene")) %>%
  ## count variants per year and classification
  count(year, ClinicalSignificance, name = "n") %>%
  ## divide by thousands
  mutate(nthousands = n / 1000)

###############################################################################

## Fig. 1b - ClinVar variants for secreted proteins, annual
## one line per clinical significance class, counts shown in thousands
secreted_clinvar_plot <- ggplot(clinvar_secreted,
                                aes(x = year,
                                    y = nthousands,
                                    group = ClinicalSignificance,
                                    color = ClinicalSignificance)) +
  ## line graph
  geom_line(linewidth = 0.5) +
  ## points
  geom_point(size = 0.5) +
  ## rescale colors, x and y axes
  scale_color_manual(values = c('Pathogenic' = 'firebrick3',
                                'Likely pathogenic' = 'coral',
                                'Benign' = 'royalblue3',
                                'Likely benign' = 'steelblue1',
                                'VUS' = 'grey50')) + 
  scale_x_discrete(expand = c(0.05, 0)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-1, 91), 
                     breaks = seq(0, 90, by = 30)) +
  ## alter legend position, size, color, location
  guides(color = guide_legend(override.aes = list(size = 0.5,
                                                  linewidth = 0.5))) +
  theme(legend.key.width = unit(3, 'mm'),
        legend.key.height = unit(3, "mm"),
        legend.key = element_rect(fill = NA, color = NA),
        legend.title = element_blank(),
        ## place the legend inside the panel at the top-left corner
        ## (numeric legend.position is deprecated in favor of
        ## legend.position = "inside" plus legend.position.inside)
        legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.background = element_rect(fill = NA, color = NA),
        ## change axis text to be angled for horizontal space compression
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  ## add axis labels
  labs(x = "Year",
       y = "Secreted missense variants\n(thousands)")

## save Fig. 1b
ggsave(here("outputs", "main_fig_panels", "1b_secreted_clinvar.pdf"),
       plot = secreted_clinvar_plot, device = cairo_pdf,
       height = 43, width = 43, units = "mm")

```

```{r MultiSTEP construct comparison}

## import MultiSTEP construct experiment
## reads every csv under inputs/flow/strep_linkers/scale and stacks the rows
construct_pilot <- list.files(path = here("inputs", "flow",
                                          "strep_linkers",
                                          "scale"),
                             ## list.files() takes a regular expression, not
                             ## a shell glob: match names ending in .csv
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "strep_linkers", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = fitc_a,
         bfp = bv_450_a,
         mcherry = pe_texas_red_a) %>%
  ## extract source path column to usable variables
  ## (construct is the text between "100_" and the "_00<digit>_" suffix)
  extract(col = source_path, into = c("construct"),
          regex = ".*100_(.*)_00[0-9]_.*") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  ## NOTE(review): abs(min(abstrep)) is added even when the minimum is
  ## positive, and the minimum is taken over all constructs together --
  ## confirm this is intended
  mutate(adj_abstrep = abstrep + abs(min(abstrep)) + 0.01,
         ## change names and order
         ## anything that is neither poly_G384A nor WT is labeled L1-Strep-L2
         construct = case_when(construct == "poly_G384A" ~ "Unrecombined",
                               construct == "WT" ~ "MultiSTEP:\nL1-Strep",
                               TRUE ~ "MultiSTEP:\nL1-Strep-L2"),
         construct = factor(construct,
                            levels = c("Unrecombined",
                                       "MultiSTEP:\nL1-Strep",
                                       "MultiSTEP:\nL1-Strep-L2")))

###############################################################################

## Fig. S1g - pilot constructs
## (this header previously read "S2b"; the panel is written to
## S1g_constructs.pdf)
## ridgeline densities of the adjusted Strep-tag signal, one ridge per construct
construct_plot <- ggplot(construct_pilot,
                         aes(x = adj_abstrep, y = construct)) +
  ## density distributions from flow cytometry, each ridge normalized to its
  ## own maximum (ndensity)
  geom_density_ridges(aes(height = after_stat(ndensity),
                          fill = construct,
                          color = construct),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis labelled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e4),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## reverse the order of the factor levels along the y axis
  scale_y_discrete(limits = rev) +
  ## same viridis palette for outline and fill
  scale_color_viridis_d(option = "C", end = 0.8) +
  scale_fill_viridis_d(option = "C", end = 0.8) +
  ## x axis title
  labs(x = "Strep II tag-Alexa-488") +
  ## no y axis title, darker horizontal grid lines
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## write Fig. S1g to disk as a 40 x 42 mm (w x h) PDF
ggsave(
  filename = here("outputs", "supp_fig_panels", "S1g_constructs.pdf"),
  plot = construct_plot,
  device = cairo_pdf,
  width = 40,
  height = 42,
  units = "mm"
)

```

```{r heavy chain pilot experiment}

## import heavy chain pilot data with known variants
## reads every exported csv below inputs/flow/pilot_variants_102/scale
## relies on read_flow_path() (defined elsewhere in this file) to supply the
## `source_path` column that is parsed below
pilot_102 <- list.files(path = here("inputs", "flow",
                                    "pilot_variants_102",
                                    "scale"),
                        ## list.files() takes a regular expression, not a
                        ## glob, so anchor on the ".csv" extension
                        pattern = "\\.csv$",
                        recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "pilot_variants_102",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab102 = alexa_fluor_647_a,
         abstrep = alexa_fluor_488_a,
         bfp = pacific_blue_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  ## [A-Za-z] rather than [A-z]: the latter also matches "_", "[", "\", "]",
  ## "^" and "`", which could leak separators into the variant name;
  ## the dot before "csv" is escaped so it only matches a literal "."
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*-([A-Za-z]+[0-9]{0,3}[A-Z]?)_.*_(.*)\\+\\.csv") %>%
  ## shift the signal by |min| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab102 = ab102 + abs(min(ab102)) + 0.01,
         ## change names and order
         variant = case_when(variant == "control" ~ "Unrecombined",
                             TRUE ~ variant),
         ## names not listed in `levels` become NA
         variant = factor(variant,
                          levels = c("Unrecombined", "WT", "C28Y",
                                     "A37T", "S220T")))

###############################################################################

## Fig. 1e - pilot heavy chain
## ridgeline densities of the adjusted heavy chain signal, one ridge per variant
pilot_102_plot <- ggplot(pilot_102,
                         aes(x = adj_ab102, y = variant)) +
  ## density distributions from flow cytometry, each ridge normalized to its
  ## own maximum (ndensity)
  geom_density_ridges(aes(height = after_stat(ndensity),
                          fill = variant,
                          color = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis labelled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e6),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## reverse the order of the factor levels along the y axis
  scale_y_discrete(limits = rev) +
  ## same viridis palette for outline and fill
  scale_color_viridis_d(option = "C", end = 0.8) +
  scale_fill_viridis_d(option = "C", end = 0.8) +
  ## x axis title
  labs(x = "Heavy chain-Alexa-647") +
  ## no y axis title, darker horizontal grid lines
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## write Fig. 1e to disk as a 55 x 45 mm (w x h) PDF
ggsave(
  filename = here("outputs", "main_fig_panels", "1e_heavy_chain.pdf"),
  plot = pilot_102_plot,
  device = cairo_pdf,
  width = 55,
  height = 45,
  units = "mm"
)

```

```{r light chain pilot experiment}

## import light chain pilot data with known variants
## reads every exported csv below inputs/flow/pilot_variants_124/scale
## relies on read_flow_path() (defined elsewhere in this file) to supply the
## `source_path` column that is parsed below
pilot_124 <- list.files(path = here("inputs", "flow",
                                    "pilot_variants_124",
                                    "scale"),
                        ## list.files() takes a regular expression, not a
                        ## glob, so anchor on the ".csv" extension
                        pattern = "\\.csv$",
                        recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "pilot_variants_124",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab124 = alexa_647_a,
         abstrep = gfp_a,
         bfp = bfp_a,
         mcherry = m_cherry_yg_a) %>%
  ## extract source path column to usable variables
  ## [A-Za-z] rather than [A-z]: the latter also matches "_", "[", "\", "]",
  ## "^" and "`", which could leak separators into the variant name;
  ## the dot before "csv" is escaped so it only matches a literal "."
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*scale_Rep_.*_([A-Za-z]+[0-9]{0,3}[A-Z]?)_rep.*_(.*)\\+\\.csv") %>%
  ## shift each signal by |min| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab124 = ab124 + abs(min(ab124)) + 0.01,
         adj_abstrep = abstrep + abs(min(abstrep)) + 0.01,
         ## change names and order
         variant = case_when(variant == "neg" ~ "Unrecombined",
                             TRUE ~ variant),
         ## names not listed in `levels` become NA
         variant = factor(variant,
                          levels = c("Unrecombined", "WT", "C28Y",
                                     "A37T", "S220T")))

###############################################################################

## Fig. 1f - pilot light chain
## ridgeline densities of the adjusted light chain signal, one ridge per variant
pilot_124_plot <- ggplot(pilot_124,
                         aes(x = adj_ab124, y = variant)) +
  ## density distributions from flow cytometry, each ridge normalized to its
  ## own maximum (ndensity)
  geom_density_ridges(aes(height = after_stat(ndensity),
                          fill = variant,
                          color = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis labelled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e6),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## reverse the order of the factor levels along the y axis
  scale_y_discrete(limits = rev) +
  ## same viridis palette for outline and fill
  scale_color_viridis_d(option = "C", end = 0.8) +
  scale_fill_viridis_d(option = "C", end = 0.8) +
  ## x axis title
  labs(x = "Light chain-Alexa-647") +
  ## no y axis title, darker horizontal grid lines
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## write Fig. 1f to disk as a 55 x 45 mm (w x h) PDF
ggsave(
  filename = here("outputs", "main_fig_panels", "1f_light_chain.pdf"),
  plot = pilot_124_plot,
  device = cairo_pdf,
  width = 55,
  height = 45,
  units = "mm"
)

```

```{r strep tag pilot experiment}

## import strep tag pilot data with known variants
## reads every exported csv below inputs/flow/pilot_variants_strep/scale
## relies on read_flow_path() (defined elsewhere in this file) to supply the
## `source_path` column that is parsed below
pilot_strep <- list.files(path = here("inputs", "flow",
                                      "pilot_variants_strep",
                                      "scale"),
                          ## list.files() takes a regular expression, not a
                          ## glob, so anchor on the ".csv" extension
                          pattern = "\\.csv$",
                          recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "pilot_variants_strep",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = alexa_fluor_647_a,
         bfp = pacific_blue_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  ## [A-Za-z] rather than [A-z]: the latter also matches "_", "[", "\", "]",
  ## "^" and "`", which could leak separators into the variant name;
  ## the dot before "csv" is escaped so it only matches a literal "."
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*_293F_([A-Za-z]+[0-9]{0,3}[A-Z]?)_rep.*_(.*)\\+\\.csv") %>%
  ## shift the signal by |min| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_abstrep = abstrep + abs(min(abstrep)) + 0.01,
         ## change names and order
         variant = case_when(variant == "neg" ~ "Unrecombined",
                             TRUE ~ variant),
         ## names not listed in `levels` become NA
         variant = factor(variant,
                          levels = c("Unrecombined", "WT", "C28Y",
                                     "A37T", "S220T")))

###############################################################################

## Fig. 1g - pilot strep tag
## ridgeline densities of the adjusted Strep-tag signal, one ridge per variant
## NOTE(review): the import step renames `alexa_fluor_647_a` to `abstrep`,
## while the axis title below says Alexa-488 - confirm the fluorophore
pilot_strep_plot <- ggplot(pilot_strep,
                           aes(x = adj_abstrep, y = variant)) +
  ## density distributions from flow cytometry, each ridge normalized to its
  ## own maximum (ndensity)
  geom_density_ridges(aes(height = after_stat(ndensity),
                          fill = variant,
                          color = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis labelled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e6),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## reverse the order of the factor levels along the y axis
  scale_y_discrete(limits = rev) +
  ## same viridis palette for outline and fill
  scale_color_viridis_d(option = "C", end = 0.8) +
  scale_fill_viridis_d(option = "C", end = 0.8) +
  ## x axis title
  labs(x = "Strep II tag-Alexa-488") +
  ## no y axis title, darker horizontal grid lines
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## write Fig. 1g to disk as a 55 x 45 mm (w x h) PDF
ggsave(
  filename = here("outputs", "main_fig_panels", "1g_strep.pdf"),
  plot = pilot_strep_plot,
  device = cairo_pdf,
  width = 55,
  height = 45,
  units = "mm"
)

```

```{r read in Illumina sequencing data}

## import data from all Illumina sequencing runs
## recursively looks in each folder for a csv file whose name ends in
## "all_barcode_counts.csv", which is how my Illumina processing script
## outputs file names, then cleans up sample names and extracts technical
## replicate info
illumina_data <- list.files(path = here("inputs", "illumina"),
                            ## list.files() takes a regular expression, not a
                            ## glob: escape the dot and anchor at the end
                            pattern = "all_barcode_counts\\.csv$",
                            recursive = TRUE) %>%
  ## read in files one after another (map_df is sequential, not parallel)
  ## and assign column names
  ## NOTE(review): col_names is supplied, so the first line of each file is
  ## read as data - assumes the files have no header row; confirm
  map_df(~read_csv(here("inputs", "illumina", .),
                   col_names = c("barcode", "reads_mapping", "sample"))) %>%
  ## remove illumina indexed sample number from sample column
     ## specific to my sample naming scheme
  mutate(sample = gsub("_S[0-9]+$", "", sample)) %>%
  ## split out technical replicate from sample using regex
     ## specific to my sample naming scheme
  extract(sample, into = c("sample", "tech_rep"), "(.*)_([^_]+$)") %>%
  ## widen data to have each technical replicate as a column
  ## fill in barcodes that are present in one replicate but not the other
  ## with zero reads (as they were not seen)
  pivot_wider(names_from = tech_rep,
              values_from = reads_mapping,
              values_fill = 0) %>%
  ## sum technical replicates
  ## (requires the technical replicates to be named exactly rep1 and rep2)
  mutate(total_reads = rep1 + rep2)
  
```

```{r map barcodes}

## attach variant calls to the Illumina read counts
## inner join: barcodes absent from the single-variant barcode map are dropped
mapped_variants <- illumina_data %>%
  inner_join(all_barcodes_varcalled_single, by = "barcode") %>%
  ## nucleotide- and codon-level differences are not used downstream
  select(-c(diff_nt, diff_codon))

```

```{r scoring variants}

## thresholds used when scoring and averaging variants
## minimum number of replicate scores required to keep an averaged score
count_filter <- 2
## minimum overall read frequency of a variant within an experiment;
## variants below it have their reads set to 0 before scoring
frequency_filter <- 1e-6

###############################################################################

## keep the WT barcodes aside so the complete() step applied to the other
## variants does not have to deal with them; they are bound back in afterwards
mapped_wt <- filter(mapped_variants, diff_aa == "WT")

## reshape the mapped reads into per-variant, per-sample ("binwise") counts
## that the scoring step can work from
cleaned_mapped_variants <- mapped_variants %>%
  ## WT is handled separately and re-attached further down
  filter(diff_aa != "WT") %>%
  ## make every position x var_aa x sample combination explicit
  ## heatmap plotting functions need missing variants present as rows,
  ## otherwise they are not drawn as "missing"
  complete(position = seq(1, nchar(wt_FIX), by = 1), var_aa, sample,
           fill = list(NA)) %>%
  ## complete() leaves wt_aa and diff_aa as NA on the rows it adds:
  ## look the WT residue up by position and rebuild the variant name
  mutate(wt_aa = wt_FIX_aa$wt_aa[position],
         diff_aa = case_when(is.na(diff_aa) ~ paste0(wt_aa, position, var_aa),
                             TRUE ~ diff_aa)) %>%
  ## keep only variants whose position belongs to the tile named in the sample
  ## this is necessary for any experiment with sublibraries, since the score
  ## is based on reads that match the expected sublibrary
  filter((grepl("tile1", sample) & position %in% tile1) |
           (grepl("tile2", sample) & position %in% tile2) |
           (grepl("tile3", sample) & position %in% tile3)) %>%
  ## re-attach the WT rows to re-form the full data set
  bind_rows(mapped_wt) %>%
  ## variants that were never sequenced get 0 reads
  replace_na(list(total_reads = 0)) %>%
  ## add up the reads of all rows sharing a variant and sample
  ## this is the "binwise" count
  group_by(wt_aa, position, var_aa, sample) %>%
  mutate(total_binwise_reads = sum(total_reads)) %>%
  ungroup() %>%
  ## drop the barcode-level columns and collapse to one row per variant/sample
  select(position, wt_aa, var_aa, total_binwise_reads, sample) %>%
  distinct() %>%
  ## split the sample name on "_" or "-" into tile, condition/antibody,
  ## experimental replicate, and bin for calculating the weighted average
  separate(sample,
           into = c("tile", "antibody", "exp_replicate", "bin"),
           sep = "_|-")

###############################################################################

## calculate variant frequencies for filtering poorly sequenced variants
## result: `cleaned_mapped_variants` (one row per variant per bin) with an
## extra column `all_variant_reads_frequency`, the share of an experiment's
## reads (all bins pooled) that belong to the variant
variants_with_filter_frequencies <- cleaned_mapped_variants %>%
  ## sum all reads for each variant across each experiment (all bins)
  group_by(wt_aa, position, var_aa, tile, antibody, exp_replicate) %>%
  reframe(all_variant_reads = sum(total_binwise_reads)) %>%
  ## calculate overall variant frequency across each experiment (all bins)
  group_by(tile, antibody, exp_replicate) %>%
  mutate(all_variant_reads_frequency = all_variant_reads / sum(all_variant_reads)) %>%
  ungroup() %>%
  ## remove unnecessary columns
  select(-all_variant_reads) %>%
  ## join back with original data to retain bin information
  ## keys are spelled out (they are exactly the columns the two tables share)
  ## so the join cannot silently change if a column is added upstream
  right_join(cleaned_mapped_variants,
             by = c("wt_aa", "position", "var_aa",
                    "tile", "antibody", "exp_replicate"))

###############################################################################

## score variants according to VAMP-seq weights (weighted average)
## result: one row per variant x tile x antibody x experimental replicate,
## holding the raw weighted average, the substitution type, the reference
## medians used for normalization, and the normalized `score`
scored_variants <- variants_with_filter_frequencies %>% 
  ## convert reads to 0 if not above frequency filter (prevents scoring)
  mutate(total_binwise_reads = case_when(
    all_variant_reads_frequency < frequency_filter ~ 0,
    TRUE ~ total_binwise_reads)) %>%
  ## calculate binwise frequency for each variant in each sample, which will be
  ## used later to score variants
  ## do after filtering to ensure sum of all binwise frequencies in an experiment = 1.
  group_by(tile, antibody, bin, exp_replicate) %>%
  mutate(binwise_frequency = total_binwise_reads / sum(total_binwise_reads)) %>%
  ungroup() %>%
  ## adjust frequencies by weights
  group_by(wt_aa, position, var_aa, tile, antibody, exp_replicate) %>%
  ## calculate a weighted frequency within each bin for each variant per experiment
  ## (rows whose bin is not bin1-bin4 get weight 0)
  mutate(weighted_value = case_when(bin == "bin1" ~ 0.25*binwise_frequency,
                                    bin == "bin2" ~ 0.50*binwise_frequency,
                                    bin == "bin3" ~ 0.75*binwise_frequency,
                                    bin == "bin4" ~ 1.00*binwise_frequency,
                                    TRUE ~ 0)) %>%
  ## average the values (range should be between 0.25 and 1.00 for default values)
  ## reframe() collapses the bins to one row per variant per experiment and
  ## returns an ungrouped data frame; a variant whose binwise frequencies sum
  ## to 0 gets 0 / 0 = NaN here
  reframe(weighted_average = sum(weighted_value) / sum(binwise_frequency)) %>%
  ## min-max normalize the data within each experiment
  group_by(tile, antibody, exp_replicate) %>%
  ## first, identify type of amino acid substitution
  ## NOTE(review): assumes WT rows carry position == 0 - confirm against the
  ## barcode map
  mutate(type = case_when(position == 0 ~ "WT",
                          var_aa == "X" ~ "nonsense",
                          var_aa == wt_aa ~ "synonymous",
                          TRUE ~ "missense"),
         ## calculate a min-max normalized score
            ## normalize between the median of the lowest 5% of variants (score = 0)
            ## and the median of the WT weighted averages (score = 1).
            ## NOTE(review): median_syn is computed below but is not used in
            ## the score formula - confirm that WT, not the synonymous
            ## distribution, is the intended upper reference
         ## split variants in the assay into 20 equal-sized rank bins
         ## (bin 1 = lowest 5% of weighted averages)
         percentile = ntile(weighted_average, n = 20),
         ## calculate the median score for the lowest 5% of variants in assay
         ## median weighted average of lowest 5% of scores
         median_5 = median(subset(weighted_average, percentile == 1),
                           na.rm = TRUE),
         ## median weighted average of the WT rows
         median_WT = median(subset(weighted_average, type == "WT"),
                            na.rm = TRUE),
         ## median weighted average of the synonymous variants
         median_syn = median(subset(weighted_average, type == "synonymous"),
                             na.rm = TRUE),
         ## min-max normalization of scores (WT = 1, median of lowest 5% = 0)
         score = (weighted_average - median_5) / (median_WT - median_5)) %>%
  ungroup() 

## trim the scored table to the columns needed downstream
scored_variants <- scored_variants %>%
  ## drop nonsense variants (not designed in library);
  ## rows with a missing var_aa are kept
  filter(var_aa != "X" | is.na(var_aa)) %>%
  ## keep the variant identity, the experiment identity, the substitution
  ## type, and the normalized score
  select(wt_aa, position, var_aa, tile, antibody, exp_replicate, type, score)

```

```{r average scores}

## per-sublibrary (tile) averages, kept only where sublibraries overlap
scored_variants_sublibrary_average <-
  group_by(scored_variants, tile, antibody, wt_aa, position, var_aa) %>%
  ## for each variant within a tile and antibody, across experimental
  ## replicates: number of non-missing scores, their mean, and the
  ## standard error of that mean
  summarise(n_exp = sum(!is.na(score)),
            average_sublibrary_score = mean(score, na.rm = TRUE),
            se_score = sd(score, na.rm = TRUE) / sqrt(n_exp)) %>%
  ungroup() %>%
  ## positions covered by two sublibraries (tile 1/2 or tile 2/3 overlap)
  filter(position %in% overlap12 | position %in% overlap23)

###############################################################################

## per-antibody averages, pooling all sublibraries and experimental replicates
scored_variants_average <- scored_variants %>%
  ## number of non-missing scores, their mean, and the standard error of the
  ## mean for every variant under every antibody
  group_by(antibody, wt_aa, position, var_aa) %>%
  summarise(n_exp = sum(!is.na(score)),
            average_score = mean(score, na.rm = TRUE),
            se_score = sd(score, na.rm = TRUE) / sqrt(n_exp)) %>%
  ungroup() %>%
  ## plotting copy of the score, step 1:
  ## blank out variants scored in fewer than count_filter experiments
  mutate(average_score2 = case_when(n_exp < count_filter ~ NaN,
                                    TRUE ~ average_score)) %>%
  ## plotting copy of the score, step 2:
  ## raise negative scores to 0, for easier plotting
  mutate(average_score2 = case_when(average_score2 <= 0 ~ 0,
                                    TRUE ~ average_score2)) %>%
  ## flag WT (position 0) and synonymous (wt_aa == var_aa) rows for plotting,
  ## and recreate variant shorthand notation
  mutate(is_wt = case_when(wt_aa == var_aa | position == 0 ~ "WT",
                           TRUE ~ "not WT"),
         variant = case_when(position == 0 ~ "WT",
                             TRUE ~ paste0(wt_aa, position, var_aa))) %>%
  ## fix the plotting order of the substituted residue
  mutate(var_aa = factor(var_aa,
                         levels = c("A", "V", "I", "L", "M", "F", "Y",
                                    "W", "S", "T", "N", "Q", "C", "G",
                                    "P", "R", "H", "K", "D", "E", "X")))

## wide layout: one row per variant, one "ab<antibody>" column per antibody
scored_variants_average_wide <-
  ## attach the FIX domain of each position (used for coloring points)
  left_join(scored_variants_average, domains, by = "position") %>%
  ## only the plotting score, the variant name, and its domain are needed
  select(antibody, variant, average_score2, domain) %>%
  ## spread the antibodies over columns
  pivot_wider(names_from = antibody,
              values_from = average_score2,
              names_prefix = "ab")

```

```{r synonymous thresholds}

## thresholds taken from the synonymous score distribution:
## the 5th and 95th percentile of average_score for each antibody
syn_threshold <- scored_variants_average %>%
  ## rows flagged "WT" that are not true WT (true WT have position = 0),
  ## i.e. the synonymous variants
  filter(is_wt == "WT", position != 0) %>%
  group_by(antibody) %>%
  summarise(lower = quantile(average_score, 0.05, na.rm = TRUE),
            upper = quantile(average_score, 0.95, na.rm = TRUE))

###############################################################################

## single-row layout of the synonymous thresholds,
## with columns named lower_<antibody> and upper_<antibody>
syn_threshold_wide <- pivot_wider(syn_threshold,
                                  names_from = "antibody",
                                  values_from = c("lower", "upper"),
                                  names_glue = "{.value}_{antibody}")

```

```{r C28Y in context of library}

## import C28Y and library flow data
## reads every exported csv below inputs/flow/c28y_library
## relies on read_flow_path() (defined elsewhere in this file) to supply the
## `source_path` column that is parsed below
c28y_lib <- list.files(path = here("inputs", "flow",
                                   "c28y_library"),
                       ## list.files() takes a regular expression, not a
                       ## glob, so anchor on the ".csv" extension
                       pattern = "\\.csv$",
                       recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "c28y_library", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab102_mcherry = derived) %>%
  ## extract source path column to usable variables
  extract(col = source_path, into = c("variant"),
          regex = "export_(.*)_mCherry.*\\.csv") %>%
  ## change names and order
  ## (every sample other than lib-tile3 has its name upper-cased)
  mutate(variant = case_when(variant == "lib-tile3" ~ "Library",
                             TRUE ~ toupper(variant)),
         variant = factor(variant,
                          levels = c("Library", "C28Y", "WT")))

###############################################################################

## Fig. S2a - C28Y vs. library
## overlaid, peak-normalized densities of the heavy chain:mCherry ratio
c28y_lib_plot <- ggplot(c28y_lib) +
  ## one scaled density curve per sample
  geom_density(aes(x = ab102_mcherry,
                   y = after_stat(scaled),
                   color = variant,
                   fill = variant),
               bw = 1/64, alpha = 0.4) +
  ## log10 x axis labelled as powers of ten, no padding
  scale_x_log10(limits = c(7e-4, 3e0),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 in steps of 0.25, no padding
  scale_y_continuous(limits = c(0, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = function(x) as.character(x)) +
  ## colors are assigned in factor-level order (Library, C28Y, WT)
  scale_color_manual(values = c("lightblue3", "orange", "tomato1")) +
  scale_fill_manual(values = c("lightblue3", "orange", "tomato1")) +
  ## axis titles
  labs(x = "Heavy chain-Alexa-647:mCherry ratio",
       y = "Density") +
  ## list the legend entries in reverse level order
  guides(fill = guide_legend(reverse = TRUE),
         color = guide_legend(reverse = TRUE)) +
  ## legend inside the panel at the top left, without title or background
  theme(legend.title = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.background = element_rect(fill = NA),
        legend.justification = c(0, 1),
        legend.position = c(0, 1))

## write Fig. S2a to disk as an 87.2 x 40.7 mm (w x h) PDF
ggsave(
  filename = here("outputs", "supp_fig_panels", "S2a_C28Y_library.pdf"),
  plot = c28y_lib_plot,
  device = cairo_pdf,
  width = 87.2,
  height = 40.7,
  units = "mm"
)

```

```{r adherent}

## import MultiSTEP in adherent HEK-293T
## reads every exported csv below inputs/flow/adherent/scale, reduces each
## variant to one geometric mean, and pairs it with the heavy chain ("102")
## secretion score of the same variant
## relies on read_flow_path() (defined elsewhere in this file) to supply the
## `source_path` column that is parsed below
adherent_variants <- list.files(path = here("inputs", "flow",
                                            "adherent",
                                            "scale"),
                                ## list.files() takes a regular expression,
                                ## not a glob, so anchor on the extension
                                pattern = "\\.csv$",
                                recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "adherent",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  ## NOTE(review): the channel is apc_cy7_a while the plot axis title says
  ## Alexa-647 - confirm the fluorophore
  rename(ab102 = apc_cy7_a) %>%
  ## extract source path column to usable variables
  ## (the dot before "csv" is escaped so it only matches a literal ".")
  extract(col = source_path, into = c("variant"),
          regex = ".*_346V_(.*)_[0-9].*\\+\\.csv") %>%
  ## shift the signal by |min| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab102 = ab102 + abs(min(ab102)) + 0.01,
         ## change names and order
         variant = case_when(variant == "NT" ~ "Negative control",
                             TRUE ~ variant)) %>%
  ## calculate geometric mean for each variant (thousands)
  ## all rows of a variant are pooled; replicates are not kept apart
  group_by(variant) %>%
  summarise(geomean_adj_ab102 = exp(mean(log(adj_ab102))) / 1000) %>%
  ungroup() %>%
  ## join with scored variants
  left_join(scored_variants_average %>%
              filter(antibody == "102"),
            by = "variant") %>%
  ## add normalized (definition) score for negative control
  mutate(average_score2 = case_when(variant == "Negative control" ~ 0,
                                    TRUE ~ average_score2)) %>%
  ## rows without a standard error get 0 so the error bar has no length
  replace_na(list(se_score = 0)) %>%
  ## drop variants that have no usable secretion score
  filter(!is.na(average_score2))
  
## Fig. S3c - adherent comparison plot
## secretion score against the geometric mean signal measured in adherent cells
adherent_plot <- ggplot(adherent_variants,
                        aes(x = geomean_adj_ab102, y = average_score2)) +
  ## error bars (score +/- standard error) drawn underneath the points
  geom_linerange(aes(ymin = average_score2 - se_score,
                     ymax = average_score2 + se_score),
                 linewidth = 0.3) +
  geom_point(color = "steelblue", size = 1) +
  ## correlation coefficient label
  stat_cor(aes(label = paste(after_stat(r.label))),
           label.x = 0.01, label.y = 1.49, hjust = 0, size = 6 / .pt) +
  ## dashed linear fit extended over the full x range
  stat_smooth(method = "lm", fullrange = TRUE, geom = "line",
              color = "black", linetype = "dashed") +
  ## x axis from 0 to 15 in steps of 5, no padding
  scale_x_continuous(limits = c(-0.2, 15.2),
                     expand = c(0, 0),
                     breaks = seq(0, 15, by = 5),
                     labels = function(x) as.character(x)) +
  ## y axis from 0 to 1.5 in steps of 0.5, no padding
  scale_y_continuous(limits = c(-0.05, 1.55),
                     expand = c(0, 0),
                     breaks = seq(0, 1.5, by = 0.5),
                     labels = function(x) as.character(x)) +
  ## axis titles
  labs(x = "Geometric mean heavy chain\nAlexa-647 (thousands, HEK-293T)",
       y = "Heavy chain secretion score\n(293-F)")

## write Fig. S3c (adherent comparison) to disk as a 42.3 x 42.3 mm PDF
ggsave(
  filename = here("outputs", "supp_fig_panels", "S3c_adherent.pdf"),
  plot = adherent_plot,
  device = cairo_pdf,
  width = 42.3,
  height = 42.3,
  units = "mm"
)

```

```{r check variant correlation}

## variant replication by tile
## pairs every score with the score of the same variant (same antibody and
## tile) in each other experimental replicate
variant_correlation <- scored_variants %>%
  ## self-join: the .x / .y suffixes mark the two replicates being compared
  full_join(scored_variants,
            by = c("wt_aa", "position", "var_aa",
                   "antibody", "tile", "type"),
            multiple = "all", relationship = "many-to-many") %>%
  ## antibody_table supplies antibody_label2 for easier labeling
  left_join(antibody_table, by = "antibody") %>%
  ## keep only useful columns
  select(wt_aa, position, var_aa, tile,
         exp_replicate.x, exp_replicate.y,
         score.x, score.y,
         antibody, antibody_label2) %>%
  ## drop WT (position 0),
  ## keep each replicate pair once (A vs B, not B vs A or A vs A),
  ## and keep only rows where both scores are present (or cor fails)
  ## NOTE(review): exp_replicate appears to be text produced by separate(),
  ## so `<` would compare strings - fine for single-digit replicates; confirm
  filter(position != 0,
         exp_replicate.x < exp_replicate.y,
         if_all(contains("score"), ~ !is.na(.))) %>%
  ## panel titles and one fixed color per antibody
  mutate(tile = gsub("tile", "tile ", tile),
         label = paste0(antibody_label2, "\n", tile,
                        ": rep ", exp_replicate.x,
                        " vs. rep ", exp_replicate.y),
         color_label = case_when(antibody == "001" ~ "#00496F",
                                 antibody == "3570" ~ "#0F85A0",
                                 antibody == "124" ~ "#ED8B00",
                                 antibody == "102" ~ "#DD4124",
                                 antibody == "strep" ~ "#EDD746"))

###############################################################################

## number of paired variant scores behind every antibody / comparison label
## (the result stays grouped by antibody_label2 and label)
tile_variants <- count(group_by(variant_correlation, antibody_label2, label))

###############################################################################

## mean replicate-vs-replicate correlation over all comparisons,
## returned as a single number rounded to two digits
average_correlation_replicates <- variant_correlation %>%
  ## correlation between the paired scores of every comparison label
  group_by(label) %>%
  summarise(correlation = cor(score.x, score.y,
                              use = "pairwise.complete.obs")) %>%
  ungroup() %>%
  ## average the per-comparison correlations and pull the value out
  summarise(mean_correlation = mean(correlation)) %>%
  pull(mean_correlation) %>%
  round(digits = 2)

###############################################################################

## Fig. S4 - Across replicate variant correlation plots
## builds one scatter plot per `label` (antibody, tile, and replicate pair)
## and assembles the panels with wrap_plots()
experimental_replicate_correlations <- variant_correlation %>%
  ## arrange in order of antibody and tile
  arrange(antibody, tile, exp_replicate.x) %>%
  ## nest by labeling variable
  group_by(label) %>%
  nest() %>%
  ## create plots with map2 function
  ## .x = nested data of one panel, .y = label of that panel
  mutate(plot = map2(data, label, ~ggplot(data = .x,
                                          aes(x = score.x,
                                              y = score.y)) +
                       ## add points, colored by label
                       ## (one color value per row of the panel's data)
                       geom_point(color = .x$color_label,
                                  size = 1, alpha = 0.2) +
                       ## add correlation statistic
                       stat_cor(aes(label = paste(after_stat(r.label))),
                                label.y = 1.9, label.x = -0.4, hjust = 0, size = 6 / .pt) +
                       ## add perfect correlation line
                       geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
                       ## adjust x and y axes
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(-0.55, 2.05),
                                          breaks = seq(-0.5, 2, by = 0.5),
                                          labels = function(x) as.character(x)) +
                       scale_y_continuous(expand = c(0, 0),
                                          limits = c(-0.55, 2.05),
                                          breaks = seq(-0.5, 2, by = 0.5),
                                          labels = function(x) as.character(x)) +
                       ## add titles
                       ## unique() reduces the replicate columns to one value
                       ## per panel, so each axis title is a single string
                       ## rather than one copy per data row
                       labs(x = paste("Score in replicate",
                                      unique(.x$exp_replicate.x)),
                            y = paste("Score in replicate",
                                      unique(.x$exp_replicate.y)),
                            title = .y))) %>%
  ## extract only plots into list
  pull(plot) %>%
  ## arrange list of plots
  wrap_plots()

## write Fig. S4 to disk as a 180 x 150 mm (w x h) PDF
ggsave(
  filename = here("outputs", "supp_fig_panels", "S4_replicate_correlations.pdf"),
  plot = experimental_replicate_correlations,
  device = cairo_pdf,
  width = 180,
  height = 150,
  units = "mm"
)

```

```{r tile correlations}

## calculate variant correlation across tiles
## for variants in a tile overlap, pairs the tile 2 average (tileA) with the
## average from the other tile that covers the position (tileB)
tile_correlations <- scored_variants_sublibrary_average %>%
  ## join with antibody table for easier labeling
  left_join(antibody_table, by = "antibody") %>%
  ## remove WT
  filter(position != 0) %>%
  ## keep only useful columns
  select(wt_aa, position, var_aa, tile, antibody, average_sublibrary_score,
         antibody_label2) %>%
  ## convert tiles to columns and fill in values
  pivot_wider(names_from = tile,
              values_from = average_sublibrary_score) %>%
  ## keep variants that have a score in at least one tile
  ## (drops variants that are missing from all experiments, or cor fails)
  filter(if_any(contains("tile"), complete.cases)) %>%
  ## combine, create comparison title for faceting, add colors
  ## tile 2 takes part in both overlaps, so it is always side A;
  ## side B is whichever of tile 1 / tile 3 is not missing, and the labels
  ## must name that same tile:
  ##   tile1 missing -> B is tile 3 -> "tile 2 vs. tile 3"
  ##   tile3 missing -> B is tile 1 -> "tile 1 vs. tile 2"
  mutate(tileA = tile2,
         tileA_label = "tile 2",
         tileB = case_when(is.na(tile1) ~ tile3,
                           is.na(tile3) ~ tile1),
         tileB_label = case_when(is.na(tile1) ~ "tile 3",
                                 is.na(tile3) ~ "tile 1"),
         comparison = case_when(is.na(tile1) ~ "tile 2 vs. tile 3",
                                is.na(tile3) ~ "tile 1 vs. tile 2"),
         comparison = paste0(antibody_label2, ":\n", comparison),
         color_label = case_when(antibody == "001" ~ "#00496F",
                                 antibody == "3570" ~ "#0F85A0",
                                 antibody == "124" ~ "#ED8B00",
                                 antibody == "102" ~ "#DD4124",
                                 antibody == "strep" ~ "#EDD746")) %>%
  ## remove unnecessary columns
  select(-tile1, -tile2, -tile3)

###############################################################################

## mean tile-to-tile correlation over every antibody/tile comparison
average_correlation_tiles <- tile_correlations %>%
  ## one Pearson r per comparison, using only variants scored in both tiles
  group_by(comparison) %>%
  summarise(correlation = cor(tileA, tileB, use = "pairwise.complete.obs"),
            .groups = "drop") %>%
  ## average the per-comparison correlations
  pull(correlation) %>%
  mean() %>%
  ## round to two digits
  round(digits = 2)

###############################################################################

## Fig. S5 - Correlation of shared variants across tiles
##   one scatter panel per comparison (antibody x tile pair); tileA (tile 2)
##   is on the x axis and tileB on the y axis
shared_tile_correlations <- tile_correlations %>%
  ## arrange in order of antibody and tile
  arrange(antibody, tileB_label) %>%
  ## nest by labeling variable
  group_by(comparison) %>%
  nest() %>%
  ## create plots with map2 function (.x = nested data, .y = comparison)
  mutate(plot = map2(data, comparison, ~ggplot(data = .x,
                                               aes(x = tileA,
                                                   y = tileB)) +
                       ## add points, colored by label (one color per row)
                       geom_point(color = .x$color_label,
                                  size = 1, alpha = 0.2) +
                       ## add correlation statistic
                       stat_cor(aes(label = paste(after_stat(r.label))),
                                label.y = 1.3, label.x = -0.15, hjust = 0,
                                size = 6 / .pt) +
                       ## add perfect correlation line
                       geom_abline(slope = 1, intercept = 0, linetype = "dashed",
                                   color = "black") +
                       ## adjust x and y axes
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(-0.25, 1.45),
                                          breaks = seq(-0.2, 1.4, by = 0.4),
                                          labels = function(x) as.character(x)) +
                       scale_y_continuous(expand = c(0, 0),
                                          limits = c(-0.25, 1.45),
                                          breaks = seq(-0.2, 1.4, by = 0.4),
                                          labels = function(x) as.character(x)) +
                       ## add titles
                       ##   the label columns repeat the same value on every
                       ##   row; collapse them so each axis title is a single
                       ##   string instead of one copy per variant
                       labs(x = paste("Score in", unique(.x$tileA_label)),
                            y = paste("Score in", unique(.x$tileB_label)),
                            title = .y))) %>%
  ## extract only plots into list
  pull(plot) %>%
  ## arrange list of plots, filling column by column
  wrap_plots(ncol = 5, byrow = FALSE)

## save Fig. S5
ggsave(here("outputs", "supp_fig_panels", "S5_tile_correlations.pdf"),
       plot = shared_tile_correlations, device = cairo_pdf,
       height = 72, width = 180, units = "mm")

```

```{r plot secretion heatmaps}

## dataset used for the heatmaps: every averaged score except the true WT row
scored_variants_final <- filter(scored_variants_average, variant != "WT")

###############################################################################

## create dataframe to store all antibody heatmaps
##   one row per antibody, with the finished ggplot in the `plot` list-column
    ## use filter(antibody == "XXX") %>% pull(plot) %>% wrap_plots() to plot
    ## must add & theme(legend.position = "bottom") after wrap_plots()
full_heatmaps <- scored_variants_final %>%
  ## add antibody labels
  left_join(antibody_table, by = "antibody") %>%
  ## nest by labeling variable
  group_by(antibody) %>%
  nest() %>%
  ## create plots with map2 function (.x = nested data, .y = antibody)
  mutate(plot = map2(data, antibody, ~heatmap_plot(data = .x) +
                       ## scale x axis to have nice breaks
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(0.5, 461.5),
                                          breaks = c(1, 50, 100, 150, 200, 250,
                                                     300, 350, 400, 461)) + 
                       ## add title
                       ##   antibody_label repeats on every row of the nested
                       ##   data; collapse it so the title is one string
                       ##   rather than one copy per variant
                       labs(title = unique(.x$antibody_label)) +
                       ## adjust plot background, legend, title
                       theme(axis.text.x = element_text(angle = 90,
                                                        hjust = 1, vjust = 0.5),
                             legend.key = element_rect(fill = NA, color = "black"),
                             legend.key.height = unit(2.5, "mm"),
                             legend.key.width = unit(20, "mm"),
                             legend.position = "bottom") +
                       ## adjust legends (order: score colorbar, WT, missing)
                       guides(color = guide_legend(title = "Missing",
                                                   title.position = "top", 
                                                   title.hjust = 0.5,
                                                   keywidth = unit(2.5, "mm"),
                                                   override.aes = list(fill = "grey50"),
                                                   order = 3),
                              shape = guide_legend(title = "WT",
                                                   title.position = "top",
                                                   title.hjust = 0.5,
                                                   keywidth = unit(2.5, "mm"),
                                                   override.aes = list(size = 1),
                                                   order = 2),
                              fill = guide_colorbar(title = "Functional score",
                                                    title.position = "top",
                                                    title.hjust = 0.5,
                                                    frame.colour = "grey20",
                                                    ticks.colour = "grey20",
                                                    order = 1))))

```

```{r heavy and light chain heatmaps}

## stack the heavy (102) and light (124) chain heatmaps in one column with a
## single shared legend placed underneath
FIX_ab_secretion_heatmaps <- wrap_plots(
  full_heatmaps$plot[full_heatmaps$antibody %in% c("102", "124")],
  ncol = 1, guides = "collect"
) &
  theme(legend.position = "bottom")

## write Fig. 2b and 2c
ggsave(filename = here("outputs", "main_fig_panels",
                       "2bc_heavy_and_light_chain_heatmaps.pdf"),
       plot = FIX_ab_secretion_heatmaps,
       device = cairo_pdf,
       width = 180, height = 85, units = "mm")

```

```{r strep heatmap}

## pull out the strep II tag heatmap and put its legend underneath
strep_secretion_heatmap <- wrap_plots(
  full_heatmaps$plot[full_heatmaps$antibody %in% "strep"]
) &
  theme(legend.position = "bottom")

## write Fig. S2c
ggsave(filename = here("outputs", "supp_fig_panels", "S2c_strep_heatmap.pdf"),
       plot = strep_secretion_heatmap,
       device = cairo_pdf,
       width = 180, height = 60, units = "mm")

```

```{r quantify number of scored variants}

## label every scored variant as WT, synonymous, or missense
scored_variants_average_typed <- scored_variants_average %>%
  ## drop rows without an averaged score
  filter(!is.na(average_score2)) %>%
  ## position 0 marks WT; a matching WT/variant residue marks a synonymous
  ## change; everything else is treated as missense
  mutate(type = case_when(position == 0 ~ "WT",
                          var_aa == wt_aa ~ "Synonymous",
                          TRUE ~ "Missense")) %>%
  ## attach the antibody label columns
  left_join(antibody_table, by = "antibody")

###############################################################################

## antibody x variant-type table of scored variants (WT excluded),
## with row and column totals
variants_scored <- adorn_totals(
  tabyl(filter(scored_variants_average_typed, position != 0),
        antibody, type),
  where = c("row", "col")
)

## number of variants of each type scored with at least one antibody
variants_scored_any <- scored_variants_average_typed %>%
  ## keep only the columns needed to build one row per variant
  select(variant, average_score2, antibody_nonnum, type) %>%
  ## spread antibodies into columns so each variant is counted once
  pivot_wider(names_from = antibody_nonnum,
              values_from = average_score2) %>%
  ## count by type and append a total row
  tabyl(type) %>%
  adorn_totals(where = "row")
  

```

```{r secretion score distributions}

## create dataframe to store all score distributions
    ## nested dataframe containing each distribution
    ## use filter(antibody == "XXX") %>% pull(plot) %>% wrap_plots() to plot
all_distributions <- scored_variants_average_typed %>%
  ## adjust axis title label
  mutate(label = case_when(antibody %in% c("001", "3570") ~ 
                             gsub("antibody", "carboxylation score",
                                  antibody_label2),
                           antibody %in% c("102", "124", "strep") ~ 
                             gsub("antibody", "secretion score",
                                  antibody_label2))) %>%
  ## add synonymous thresholds
  left_join(syn_threshold, by = "antibody") %>%
  ## keep only missense and synonymous
  filter(type %in% c("Missense", "Synonymous")) %>%
  ## nest by labeling variable
  group_by(antibody) %>%
  nest() %>%
  ## create plots with map2 function (.x = nested data, .y = antibody)
  mutate(plot = map2(data, antibody, ~ggplot(data = .x) +
                       ## scaled densities of missense vs. synonymous scores
                       geom_density(aes(x = average_score2,
                                        y = after_stat(scaled),
                                        fill = type, color = type),
                                    alpha = 0.3, bw = 1/64) + 
                       ## add vertical synonymous threshold line
                       ##   passed as a parameter rather than through aes()
                       ##   so the line is drawn once per threshold value,
                       ##   not once per row of the plot data
                       geom_vline(xintercept = unique(.x$lower),
                                  color = "black", linetype = "dashed") +
                       ## rescale x and y axes, color, and fill
                       ##   upper x limit widens to 1.6 only when a score
                       ##   exceeds 1.3
                       scale_x_continuous(expand = c(0, 0),
                                          limits = function(x) {
                                            c(-0.3,
                                              (if(max(x) > 1.3) {1.6}
                                               else {1.3}))
                                              },
                                          breaks = seq(-0.25, 1.5, by = 0.25),
                                          labels = function(x) as.character(x)) +
                       scale_y_continuous(expand = c(0, 0),
                                          limits = c(-0.01, 1.01),
                                          breaks = c(0, 0.25, 0.5, 0.75, 1),
                                          labels = function(x) as.character(x)) +
                       scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(4, 1)]) +
                       scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(4, 1)]) +
                       ## add label titles
                       ##   label repeats on every row; collapse it so the
                       ##   axis title is a single string
                       labs(x = unique(.x$label),
                            y = "Density") +
                       ## adjust legend location and features
                       theme(legend.position = c(0, 1),
                             legend.justification = c(0, 1),
                             legend.key.size = unit(3, "mm"),
                             legend.title = element_blank(),
                             legend.background = element_rect(fill = NA)) +
                       guides(fill = guide_legend(override.aes = list(alpha = 0.7)))))

###############################################################################

## Fig. 2d - heavy chain (antibody 102) distribution of scores
##   take the first stored plot for that antibody
heavy_chain_distribution <- pluck(
  all_distributions$plot[all_distributions$antibody %in% "102"],
  1
)

## write Fig. 2d
ggsave(filename = here("outputs", "main_fig_panels",
                       "2d_heavy_chain_distribution.pdf"),
       plot = heavy_chain_distribution,
       device = cairo_pdf,
       width = 40, height = 40, units = "mm")

###############################################################################

## Fig. 2e - light chain (antibody 124) distribution of scores
##   take the first stored plot for that antibody
light_chain_distribution <- pluck(
  all_distributions$plot[all_distributions$antibody %in% "124"],
  1
)

## write Fig. 2e
ggsave(filename = here("outputs", "main_fig_panels",
                       "2e_light_chain_distribution.pdf"),
       plot = light_chain_distribution,
       device = cairo_pdf,
       width = 40, height = 40, units = "mm")

###############################################################################

## Fig. S2d - strep tag distribution of scores
##   take the first stored plot for the strep tag antibody
strep_distribution <- pluck(
  all_distributions$plot[all_distributions$antibody %in% "strep"],
  1
)

## write Fig. S2d
ggsave(filename = here("outputs", "supp_fig_panels",
                       "S2d_strep_distribution.pdf"),
       plot = strep_distribution,
       device = cairo_pdf,
       width = 45, height = 40, units = "mm")

###############################################################################

## score distributions compared across antibodies, one plot per variant type
##   (synonymous and missense); densities are colored by antibody label
compared_distributions <- scored_variants_average_typed %>%
  ## build a short legend label: assay name followed by the antibody name
  mutate(label = case_when(antibody %in% c("001", "3570") ~ 
                             paste("Carboxylation:", 
                                   gsub("antibody|Carboxylation-sensitive|\n", "",
                                        antibody_label2)),
                           antibody %in% c("102", "124", "strep") ~ 
                             paste("Secretion:", 
                                   gsub("antibody", "",
                                        antibody_label2)))) %>%
  ## drop WT rows
  filter(type %in% c("Synonymous", "Missense")) %>%
  ## one nested data frame per variant type
  group_by(type) %>%
  nest() %>%
  ## build one density plot from each nested data frame
  mutate(plot = map(data, function(scores) {
    ggplot(data = scores) +
      ## scaled density per antibody
      geom_density(aes(x = average_score2,
                       y = after_stat(scaled),
                       fill = label, color = label),
                   alpha = 0.3) +
      ## dashed vertical WT line at a score of 1
      geom_vline(xintercept = 1,
                 linetype = "dashed",
                 linewidth = 0.5) +
      ## axis titles
      labs(x = "Functional score",
           y = "Density") +
      ## fixed axis ranges with plain-text tick labels
      scale_x_continuous(limits = c(-0.3, 1.55),
                         breaks = seq(-0.25, 1.5, by = 0.25),
                         labels = as.character,
                         expand = c(0, 0)) +
      scale_y_continuous(limits = c(-0.01, 1.02),
                         breaks = seq(0, 1, by = 0.25),
                         labels = as.character,
                         expand = c(0, 0)) +
      ## same palette for outline and fill
      scale_color_manual(values = paletteer_d("PNWColors::Bay")) +
      scale_fill_manual(values = paletteer_d("PNWColors::Bay")) +
      ## legend tucked into the top-left corner, no title or background
      theme(legend.position = c(0, 1),
            legend.justification = c(0, 1),
            legend.title = element_blank(),
            legend.key.size = unit(3, "mm"),
            legend.background = element_rect(fill = NA)) +
      guides(fill = guide_legend(override.aes = list(alpha = 0.7)))
  }))

###############################################################################

## Fig. S1h - synonymous score distribution for each antibody
##   take the first stored plot for the synonymous variants
compared_syn <- pluck(
  compared_distributions$plot[compared_distributions$type %in% "Synonymous"],
  1
)

## write Fig. S1h
ggsave(filename = here("outputs", "supp_fig_panels",
                       "S1h_synonymous_distributions.pdf"),
       plot = compared_syn,
       device = cairo_pdf,
       width = 60, height = 40, units = "mm")

```

```{r heavy chain score validation}

## import heavy chain variant validation data
##   result: one row per variant with the replicate-averaged geometric mean
##   fluorescence, its standard error, and the matching heavy chain score
validation_102 <- list.files(path = here("inputs", "flow",
                                         "validation_102",
                                         "scale"),
                             ## `pattern` is a regular expression, not a
                             ## glob: match file names ending in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "validation_102", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab102 = alexa_fluor_647_a,
         abstrep = alexa_fluor_488_a,
         bfp = pacific_blue_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  ## NOTE(review): `[A-z]` also spans the punctuation between "Z" and "a"
  ##   (including "_") in ASCII order; confirm `[A-Za-z]` was not intended
  extract(col = source_path, into = c("variant", "replicate", "fluor"),
          regex = ".*-([A-z]+[0-9]{0,3}[A-Z]?)_(rep[0-9])_(.*)\\+.csv") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab102 = ab102 + abs(min(ab102)) + 0.01,
         ## change names and order
         variant = case_when(variant == "control" ~ "Unrecombined",
                             TRUE ~ variant)) %>%
  ## calculate geometric mean for each variant-replicate pair
  group_by(variant, replicate) %>%
  summarise(geomean_adj_ab102 = exp(mean(log(adj_ab102)))) %>%
  ungroup() %>%
  ## calculate average geometric mean and standard error across all replicates
  group_by(variant) %>%
  summarise(geomean_adj_ab102_avg = mean(geomean_adj_ab102),
            se_adj_ab102 = sd(geomean_adj_ab102) / sqrt(n())) %>%
  ungroup() %>%
  ## join with scored data, heavy chain only
  left_join(scored_variants_average %>%
              filter(antibody == "102"), by = "variant") %>%
  ## add score and SE of 0 for negative control (by definition of assay)
  ##   note: any variant without a matching score is set to 0 as well
  mutate(average_score = case_when(is.na(average_score) ~ 0,
                                   TRUE ~ average_score),
         se_score = case_when(is.na(se_score) ~ 0,
                              TRUE ~ se_score))

###############################################################################

## Fig. 2f - heavy chain scores against individually measured variants
validation_102_plot <- validation_102 %>%
  ## express the fluorescence mean and its SE in thousands
  mutate(across(c(geomean_adj_ab102_avg, se_adj_ab102), ~ .x / 1000)) %>%
  ggplot(aes(x = average_score, y = geomean_adj_ab102_avg)) +
  ## layers, bottom to top: r label, linear fit, error bars, points
  stat_cor(aes(label = paste(after_stat(r.label))),
           label.x.npc = 0.025, label.y.npc = 0.95,
           hjust = 0, size = 6 / .pt) +
  stat_smooth(method = "lm", se = FALSE, fullrange = TRUE,
              linetype = "dashed", linewidth = 0.5, color = "black") +
  ## vertical bars: SE of the fluorescence measurement
  geom_errorbar(aes(ymin = geomean_adj_ab102_avg - se_adj_ab102,
                    ymax = geomean_adj_ab102_avg + se_adj_ab102),
                width = 0.02, linewidth = 0.3) +
  ## horizontal bars: SE of the score
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 height = 1, linewidth = 0.3) +
  geom_point(color = paletteer_d("PNWColors::Starfish")[4], size = 1) +
  ## axis titles
  labs(x = "Heavy chain secretion score",
       y = "Geometric mean\nheavy chain-Alexa-647 (thousands)") +
  ## fixed axis ranges with plain-text tick labels
  scale_x_continuous(limits = c(-0.05, 1.3),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-1, 61),
                     breaks = seq(0, 60, by = 20),
                     labels = as.character,
                     expand = c(0, 0))

## write Fig. 2f
ggsave(filename = here("outputs", "main_fig_panels",
                       "2f_heavy_chain_validation.pdf"),
       plot = validation_102_plot,
       device = cairo_pdf,
       width = 40.6, height = 40, units = "mm")

```

```{r light chain score validation}

## import light chain variant validation data
##   result: one row per variant with the replicate-averaged geometric mean
##   fluorescence, its standard error, and the matching light chain score
validation_124 <- list.files(path = here("inputs", "flow",
                                         "validation_124",
                                         "scale"),
                             ## `pattern` is a regular expression, not a
                             ## glob: match file names ending in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "validation_124", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab124 = alexa_647_a,
         abstrep = gfp_a,
         bfp = bfp_a,
         mcherry = m_cherry_yg_a) %>%
  ## extract source path column to usable variables
  ## NOTE(review): `[A-z]` also spans the punctuation between "Z" and "a"
  ##   (including "_") in ASCII order; confirm `[A-Za-z]` was not intended
  extract(col = source_path, into = c("variant", "replicate", "fluor"),
          regex = ".*scale_Rep_.*_([A-z]+[0-9]{0,3}[A-Z]?)_(rep[0-9])_[0-9]{3}_(.*)\\+.csv") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab124 = ab124 + abs(min(ab124)) + 0.01,
         ## change names and order
         variant = case_when(variant == "neg" ~ "Unrecombined",
                             TRUE ~ variant)) %>%
  ## calculate geometric mean for each variant-replicate pair
  group_by(variant, replicate) %>%
  summarise(geomean_adj_ab124 = exp(mean(log(adj_ab124)))) %>%
  ungroup() %>%
  ## calculate average geometric mean and standard error across all replicates
  group_by(variant) %>%
  summarise(geomean_adj_ab124_avg = mean(geomean_adj_ab124),
            se_adj_ab124 = sd(geomean_adj_ab124) / sqrt(n())) %>%
  ungroup() %>%
  ## join with scored data, light chain only
  left_join(scored_variants_average %>%
              filter(antibody == "124"), by = "variant") %>%
  ## add score and SE of 0 for negative control (by definition of assay)
  ##   note: any variant without a matching score is set to 0 as well
  mutate(average_score = case_when(is.na(average_score) ~ 0,
                                   TRUE ~ average_score),
         se_score = case_when(is.na(se_score) ~ 0,
                              TRUE ~ se_score))

###############################################################################

## Fig. 2g - light chain scores against individually measured variants
validation_124_plot <- validation_124 %>%
  ## express the fluorescence mean and its SE in thousands
  mutate(across(c(geomean_adj_ab124_avg, se_adj_ab124), ~ .x / 1000)) %>%
  ggplot(aes(x = average_score, y = geomean_adj_ab124_avg)) +
  ## layers, bottom to top: r label, linear fit, error bars, points
  stat_cor(aes(label = paste(after_stat(r.label))),
           label.x.npc = 0.025, label.y.npc = 0.95,
           hjust = 0, size = 6 / .pt) +
  stat_smooth(method = "lm", se = FALSE, fullrange = TRUE,
              linetype = "dashed", linewidth = 0.5, color = "black") +
  ## vertical bars: SE of the fluorescence measurement
  geom_errorbar(aes(ymin = geomean_adj_ab124_avg - se_adj_ab124,
                    ymax = geomean_adj_ab124_avg + se_adj_ab124),
                width = 0.02, linewidth = 0.3) +
  ## horizontal bars: SE of the score
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 height = 1, linewidth = 0.3) +
  ## points in a lightened shade of the first Starfish palette color
  geom_point(color = lighten(as.list(paletteer_d("PNWColors::Starfish")[1]), amount = 0.2),
             size = 1) +
  ## axis titles
  labs(x = "Light chain secretion score",
       y = "Geometric mean\nlight chain-Alexa-647 (thousands)") +
  ## fixed axis ranges with plain-text tick labels
  scale_x_continuous(limits = c(-0.05, 1.3),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-1, 76),
                     breaks = seq(0, 75, by = 25),
                     labels = as.character,
                     expand = c(0, 0))

## write Fig. 2g
ggsave(filename = here("outputs", "main_fig_panels",
                       "2g_light_chain_validation.pdf"),
       plot = validation_124_plot,
       device = cairo_pdf,
       width = 40.6, height = 40, units = "mm")

```

```{r strep tag score validation}

## import strep tag variant validation data
##   result: one row per variant with the replicate-averaged geometric mean
##   fluorescence, its standard error, and the matching strep tag score
validation_strep <- list.files(path = here("inputs", "flow",
                                         "validation_strep",
                                         "scale"),
                             ## `pattern` is a regular expression, not a
                             ## glob: match file names ending in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "validation_strep", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = alexa_fluor_647_a,
         bfp = pacific_blue_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  ## NOTE(review): `[A-z]` also spans the punctuation between "Z" and "a"
  ##   (including "_") in ASCII order; confirm `[A-Za-z]` was not intended
  extract(col = source_path, into = c("variant", "replicate", "fluor"),
          regex = ".*_293F_([A-z]+[0-9]{0,3}[A-Z]?)_(rep[0-9])_(.*)\\+.csv") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_abstrep = abstrep + abs(min(abstrep)) + 0.01,
         ## change names
         variant = case_when(variant == "neg" ~ "Unrecombined",
                             TRUE ~ variant)) %>%
  ## calculate geometric mean for each variant-replicate pair
  group_by(variant, replicate) %>%
  summarise(geomean_adj_abstrep = exp(mean(log(adj_abstrep)))) %>%
  ungroup() %>%
  ## calculate average geometric mean and standard error across all replicates
  group_by(variant) %>%
  summarise(geomean_adj_abstrep_avg = mean(geomean_adj_abstrep),
            se_adj_abstrep = sd(geomean_adj_abstrep) / sqrt(n())) %>%
  ungroup() %>%
  ## join with scored data, strep tag only
  left_join(scored_variants_average %>%
              filter(antibody == "strep"), by = "variant") %>%
  ## add score and SE of 0 for negative control (by definition of assay)
  ##   note: any variant without a matching score is set to 0 as well
  mutate(average_score = case_when(is.na(average_score) ~ 0,
                                   TRUE ~ average_score),
         se_score = case_when(is.na(se_score) ~ 0,
                              TRUE ~ se_score))

###############################################################################

## Fig. S2e - strep tag scores against individually measured variants
validation_strep_plot <- validation_strep %>%
  ## express the fluorescence mean and its SE in thousands
  mutate(across(c(geomean_adj_abstrep_avg, se_adj_abstrep), ~ .x / 1000)) %>%
  ggplot(aes(x = average_score, y = geomean_adj_abstrep_avg)) +
  ## layers, bottom to top: r label, linear fit, error bars, points
  stat_cor(aes(label = paste(after_stat(r.label))),
           label.x.npc = 0.025, label.y.npc = 0.95,
           hjust = 0, size = 6 / .pt) +
  stat_smooth(method = "lm", se = FALSE, fullrange = TRUE,
              linetype = "dashed", linewidth = 0.5, color = "black") +
  ## vertical bars: SE of the fluorescence measurement
  geom_errorbar(aes(ymin = geomean_adj_abstrep_avg - se_adj_abstrep,
                    ymax = geomean_adj_abstrep_avg + se_adj_abstrep),
                width = 0.02, linewidth = 0.3) +
  ## horizontal bars: SE of the score
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 height = 1, linewidth = 0.3) +
  geom_point(color = "orange", size = 1) +
  ## axis titles
  labs(x = "Strep tag secretion score",
       y = "Geometric mean strep tag\nantibody fluorescence (thousands)") +
  ## fixed axis ranges with plain-text tick labels
  scale_x_continuous(limits = c(-0.05, 1.55),
                     breaks = seq(0, 1.5, by = 0.5),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-1, 51),
                     breaks = seq(0, 50, by = 10),
                     labels = as.character,
                     expand = c(0, 0))

## write Fig. S2e
ggsave(filename = here("outputs", "supp_fig_panels",
                       "S2e_strep_validation.pdf"),
       plot = validation_strep_plot,
       device = cairo_pdf,
       width = 40.6, height = 40, units = "mm")

```

```{r identify epitopes}

## flag candidate epitope positions by comparing the per-position median
## light chain (ab124) and heavy chain (ab102) antibody scores
epitopes <- scored_variants_average %>%
  ## only the columns needed to reshape
  select(wt_aa, position, var_aa, antibody, average_score) %>%
  ## one "ab<antibody>" score column per antibody
  pivot_wider(names_from = antibody,
              names_prefix = "ab",
              values_from = average_score) %>%
  ## keep variants that are not nonsense ("X"), not synonymous, and have a
  ## score in every antibody column
  filter(var_aa != "X",
         var_aa != wt_aa,
         if_all(contains("ab"), ~ !is.na(.x))) %>%
  ## median score per position for each antibody
  group_by(position) %>%
  summarise(across(contains("ab"), median)) %>%
  ## a position is called an epitope when the two medians differ by more
  ## than 0.33; it is assigned to the antibody with the lower score
  mutate(epitope = case_when(ab124 > ab102 + 0.33 ~ "Heavy chain epitope",
                             ab124 < ab102 - 0.33 ~ "Light chain epitope",
                             TRUE ~ "Not in epitope"),
         ## position number used as the point label, epitope positions only
         label = if_else(epitope != "Not in epitope", position, NA_real_),
         ## FIX region by position: 47-191 light chain, 227-461 heavy chain
         chain = case_when(position %in% 47:191 ~ "Light chain",
                           position %in% 227:461 ~ "Heavy chain",
                           TRUE ~ "Signal peptide,\npropeptide, and\nactivation peptide"))

###############################################################################

## positions called as heavy chain epitopes
heavy_chain_epitopes <-
  epitopes$position[epitopes$epitope %in% "Heavy chain epitope"]

## positions called as light chain epitopes
light_chain_epitopes <-
  epitopes$position[epitopes$epitope %in% "Light chain epitope"]

###############################################################################

## grey band of +/- 0.33 around the diagonal, used as the plot background
##   the band edges are pushed to -Inf / Inf once they pass the cutoffs
##   -0.051 and 1.2501
epitope_plotrange <- tibble(x = seq(-1, 2, by = 0.01),
                            ymin = if_else(x - 0.33 < -0.051,
                                           -Inf,
                                           x - 0.33),
                            ymax = if_else(x + 0.33 > 1.2501,
                                           Inf,
                                           x + 0.33))

###############################################################################

## Fig. 2h - per-position median scores: antibody 102 (x) vs antibody 124 (y)
epitope_plot <- ggplot(epitopes, aes(x = ab102, y = ab124)) +
  ## grey band around the diagonal (positions inside it are not epitopes)
  geom_ribbon(data = epitope_plotrange,
              mapping = aes(x = x, ymin = ymin, ymax = ymax),
              inherit.aes = FALSE, show.legend = FALSE,
              alpha = 0.3, fill = "grey") +
  ## one point per position, colored by FIX subunit
  geom_point(aes(color = chain), alpha = 0.7, size = 1) +
  ## identity line
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  ## position numbers; label is NA for positions not called as epitopes
  geom_text_repel(aes(label = label), color = "black", size = 6 / .pt,
                  show.legend = FALSE) +
  ## correlation coefficient
  stat_cor(aes(label = paste(after_stat(r.label))),
           size = 6 / .pt, hjust = 0, label.x = 0, label.y = 1.2) +
  ## identical x and y axes; plain (non-scientific) tick labels
  scale_x_continuous(limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## colors are assigned to the chain categories in sorted order
  scale_color_manual(values = c("#59629BFF", "#24492EFF", "grey50")) +
  ## axis and legend titles
  labs(color = "FIX subunit",
       x = "Heavy chain secretion score",
       y = "Light chain secretion score") +
  theme(legend.key.size = unit(3, "mm"))

## write Fig. 2h to disk
ggsave(filename = here("outputs", "main_fig_panels", "2h_epitopes.pdf"),
       plot = epitope_plot, device = cairo_pdf,
       width = 64, height = 40, units = "mm")

```

```{r light chain epitopes pymol}

## PyMOL command list for the whole-protein epitope view (Fig. 2i);
## each element becomes one line of the .pml script written below
pymol_epitope_setup <- c(
  "reinitialize",
  ## load the AlphaFold model
  paste("load ",
        here("inputs", "pymol", "AF-P00740-F1-model_v1.pdb")),
  ## background
  "bg_color white",
  "set opaque_background, 0",
  ## ray tracing settings for publication-style output
  "set ray_trace_mode, 1",
  "set ray_trace_gain, 0.00000",
  ## light chain (47-191) and heavy chain (227-461) objects, both grey
  "create LC, resi 47-191",
  "color grey80, LC",
  "create HC, resi 227-461",
  "color grey80, HC",
  ## light chain epitope residues as a colored surface
  paste("create LC_epitope, resi",
        paste(light_chain_epitopes, collapse = "+")),
  "show surface, LC_epitope",
  "color 0x24492E, LC_epitope",
  ## heavy chain epitope residues as a colored surface
  paste("create HC_epitope, resi",
        paste(heavy_chain_epitopes, collapse = "+")),
  "show surface, HC_epitope",
  "color 0x89689D, HC_epitope",
  ## drop the originally loaded object, leaving only the created ones
  "delete AF-P00740-F1-model_v1",
  ## fixed camera
  "set_view (0.295241654, 0.606785297, -0.737999320, 0.842839301, 0.198362395, 0.500274062, 0.449951440, -0.769713342, -0.452856094, -0.000066184, 0.000127479, -329.230163574, -16.785818100, -7.295051098, 17.632047653, 259.568969727, 398.894958496, -20.000000000)"
)

## command issued between the two renders: rotate 180 degrees about y
pymol_rotate <- "rotate y, 180"

## write the .pml script: setup, render view 1, rotate, render view 2
## (running it in PyMOL produces both PNGs for Fig. 2i)
file_conn <- file(here("outputs", "main_fig_panels", "2i_epitopes.pml"))
writeLines(
  c(pymol_epitope_setup,
    paste0("png ",
           here("outputs", "main_fig_panels", "2i_epitopes_view1.png"),
           ", height = 4cm, dpi = 300, ray = 1"),
    pymol_rotate,
    paste0("png ",
           here("outputs", "main_fig_panels", "2i_epitopes_view2.png"),
           ", height = 4cm, dpi = 300, ray = 1")),
  file_conn
)
close(file_conn)

```

```{r antibody epitope zoom pymol}

## PyMOL command list for the zoomed light chain epitope view (Fig. 2j);
## each element becomes one line of the .pml script written below
pymol_epitope_setup <- c(
  "reinitialize",
  ## load the AlphaFold model
  paste("load ",
        here("inputs", "pymol", "AF-P00740-F1-model_v1.pdb")),
  ## background
  "bg_color white",
  "set opaque_background, 0",
  ## ray tracing settings for publication-style output
  "set ray_trace_mode, 1",
  "set ray_trace_gain, 0.00000",
  ## named selections for the chains and EGF1, with their colors
  "select LC, resi 47-191",
  "color grey80, LC",
  "select HC, resi 227-461",
  "color grey80, HC",
  "select EGF1, resi 93-128",
  "color 0xC67B6F, EGF1",
  ## light chain epitope residues and their color
  paste("select LC_epitope, resi",
        paste(light_chain_epitopes, collapse = "+")),
  "color 0x24492E, LC_epitope",
  ## residues that receive a text label
  ## NOTE(review): 116 is labeled here, but a label position is set for
  ## resi 118 (not 116) further down - confirm which residue is intended
  paste("select labeledLC, resi",
        paste(c("104-105", "115", "116", "121-124"),
              collapse = "+")),
  ## hide everything outside the two chains
  "hide cartoon, not LC and not HC",
  ## separate object holding only the epitope residues
  "create sLCep, LC_epitope",
  ## draw the epitope object as surface + sticks, without cartoon
  "show surface, sLCep",
  "show sticks, sLCep",
  "hide cartoon, sLCep",
  ## surface settings and gaussian map built from the epitope object
  "set surface_quality, 1",
  "alter all, b = 50",
  "alter all, q = 1",
  "set gaussian_resolution, 5",
  "map_new mapLC, gaussian, 1, sLCep, 6",
  "isosurface surfLC, mapLC",
  "hide surface, surfLC",
  "set transparency, 0.5, sLCep",
  ## three-letter to one-letter amino acid lookup used by the label command
  "one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', 'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', 'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A', 'GLY':'G', 'PRO':'P', 'CYS':'C'}",
  ## label alpha carbons of the chosen residues as <one-letter AA><number>
  "label n. CA and labeledLC, '%s%s' % (one_letter[resn], resi)",
  ## fixed camera
  "set_view (0.303145170, 0.604888439, -0.736350536, 0.839591920, 0.195977747, 0.506633937, 0.450766504, -0.771814287, -0.448448181, 0.000000000, 0.000000000, -102.083076477, -26.189998627, -7.981998444, 21.940002441, 80.483078003, 123.683074951, -20.000000000)",
  ## label font and size
  "set label_font_id, 13",
  "set label_size, 18",
  ## manual per-residue label offsets
  "set label_position, (1.5, 3.5, 0), resi 104",
  "set label_position, (0.75, -1, 0), resi 105",
  "set label_position, (4, 3, 0), resi 115",
  "set label_position, (-5, -1, 0), resi 118",
  "set label_position, (-4.25, 4, 0), resi 121",
  "set label_position, (-0.5, 3.5, 0), resi 122",
  "set label_position, (-1, -2.5, 0), resi 123",
  "set label_position, (2, 7, 0), resi 124"
)


## write the .pml script: setup followed by a single PNG render
## (running it in PyMOL produces the Fig. 2j image)
file_conn <- file(here("outputs", "main_fig_panels", "2j_light_zoom.pml"))
writeLines(
  c(pymol_epitope_setup,
    paste0("png ",
           here("outputs", "main_fig_panels", "2j_light_zoom.png"),
           ", height = 4cm, dpi = 300, ray = 1")),
  file_conn
)
close(file_conn)

###############################################################################

## PyMOL command list for the zoomed heavy chain epitope view (Fig. 2k);
## each element becomes one line of the .pml script written below
pymol_epitope_setup <- c("reinitialize",
                         ## load AlphaFold file
                         paste("load ",
                               here("inputs", "pymol",
                                    "AF-P00740-F1-model_v1.pdb")),
                         ## change background settings
                         "bg_color white",
                         "set opaque_background, 0",
                         ## publication style 
                         "set ray_trace_mode, 1",
                         "set ray_trace_gain, 0.00000",
                         ## define light and heavy chain selections and color
                         "select LC, resi 47-191",
                         "color grey80, LC",
                         "select HC, resi 227-461",
                         "color 0xFBDFA2, HC",
                         ## define epitope positions and color
                         paste("select HC_epitope, resi",
                               paste(heavy_chain_epitopes,
                                     collapse = "+")),
                         "color 0x89689D, HC_epitope",
                         ## define epitope positions to label
                         paste("select labeledHC, resi",
                               paste(c("245", "249", "270-274", "295-297"),
                                     collapse = "+")),
                         ## hide unstructured parts of FIX
                         "hide cartoon, not LC and not HC",
                         ## set up object for epitope
                         "create sHCep, HC_epitope",
                         ## show surface
                         "show surface, sHCep",
                         ## show sticks
                         "show sticks, sHCep",
                         ## hide cartoon of object
                         "hide cartoon, sHCep",
                         ## rotate view
                         "rotate y, 180",
                         ## improve surface
                         "set surface_quality, 1",
                         "alter all, b = 50",
                         "alter all, q = 1",
                         "set gaussian_resolution, 5",
                         ## build the map from sHCep, the epitope object created
                         ## above; sLCep is never created in this script (the
                         ## session starts with `reinitialize`), so referencing
                         ## it here left the heavy chain map without a source
                         "map_new mapHC, gaussian, 1, sHCep, 6",
                         "isosurface surfHC, mapHC",
                         "hide surface, surfHC",
                         "set transparency, 0.5, sHCep",
                         ## create translation to 1 letter AA
                         "one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', 'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', 'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A', 'GLY':'G', 'PRO':'P', 'CYS':'C'}",
                         ## create labels
                         "label n. CA and labeledHC, '%s%s' % (one_letter[resn], resi)",
                         ## get standardized viewpoint
                         "set_view (0.303145170, 0.604888439, -0.736350536, 0.839591920, 0.195977747, 0.506633937, 0.450766504, -0.771814287, -0.448448181, 0.000000000, 0.000000000, -102.083076477, -26.189998627, -7.981998444, 21.940002441, 80.483078003, 123.683074951, -20.000000000)",
                         ## better font
                         "set label_font_id, 13",
                         "set label_size, 18",
                         ## set label positions
                         "set label_position, (-1.5, -2.5, 0), resi 245",
                         "set label_position, (-4, 3, 0), resi 249",
                         "set label_position, (-1.5, -0.2, 0), resi 270",
                         "set label_position, (3, 0.5, 0), resi 271",
                         "set label_position, (1.25, 0.75, 0), resi 272",
                         "set label_position, (0, 3, 0), resi 273",
                         "set label_position, (-0.5, 2, 0), resi 274",
                         "set label_position, (-1, 3, 0), resi 295",
                         "set label_position, (-1, -2.5, 0), resi 296",
                         "set label_position, (-0.4, 1, 20), resi 297")

## save as pml file (renders the single Fig. 2k image when opened in pymol)
## NOTE(review): the output names say "light" although this is the heavy chain
## view; left unchanged because other steps may expect these paths - confirm
file_conn <- file(here("outputs", "main_fig_panels", "2k_light_zoom.pml"))
writeLines(c(pymol_epitope_setup,
             paste("png ", here("outputs", "main_fig_panels", "2k_light_zoom.png"),
                   ", height = 4cm, dpi = 300, ray = 1", sep = "")),
           file_conn)
close(file_conn)

```

```{r epitope distances 3d}

## per-residue table for the FIX AlphaFold model (alpha carbons): pLDDT and
## x/y/z coordinates; column names are supplied here rather than read from
## the file, and position is taken from the row order
fix_coordinates <- read_csv(
  here("inputs", "coordinates", "fix_spatial_coordinates.txt"),
  col_names = c("pLDDT", "x", "y", "z")
) %>%
  mutate(position = row_number())

## every ordered pair of residues; make.unique gives the second copy of each
## column a ".1" suffix (x.1, y.1, z.1, position.1, ...)
fix_coordinates_expand <- expand_grid(
  fix_coordinates,
  fix_coordinates,
  .name_repair = make.unique
) %>%
  ## Euclidean distance between the two residues of each pair
  mutate(distance = sqrt((x.1 - x)^2 + (y.1 - y)^2 + (z.1 - z)^2)) %>%
  ## attach the per-position median scores and epitope calls of the first
  ## residue of the pair
  left_join(epitopes, by = "position")

###############################################################################

## light chain epitopes distance calculations
## One row per position: the pairing of that position with its nearest light
## chain epitope residue, plus the antibody score difference at that position.
fix_dist_3d_lc <- fix_coordinates_expand %>%
  ## join with domains
  left_join(domains, by = "position") %>%
  ## keep only positions in the Gla, EGF1 and EGF2 domains
  filter(domain_short %in% c("Gla", "EGF1", "EGF2")) %>%
  ## remove positions with poor confidence in position
  filter(pLDDT > 70) %>%
  ## keep only pairs whose partner residue is a light chain epitope position
  filter(position.1 %in% light_chain_epitopes) %>%
  ## find minimum distance to epitope for each position
  group_by(position) %>%
  slice(which.min(distance)) %>%
  ## drop the per-position grouping so a plain tibble is handed to the
  ## changepoint model and plots (the steps below are row-wise, so results
  ## are unchanged)
  ungroup() %>%
  ## calculate difference in scores
  mutate(score_diff = ab102 - ab124,
         epitope = case_when(position %in% light_chain_epitopes ~ "light chain",
                             TRUE ~ "non-epitope"))

## create version without known epitopes (and without missing differences)
fix_dist_3d_lc2 <- fix_dist_3d_lc %>%
  filter(!position %in% light_chain_epitopes,
         !is.na(score_diff))

###############################################################################

## heavy chain epitope distance calculation
## One row per position: the pairing of that position with its nearest heavy
## chain epitope residue, plus the antibody score difference at that position.
fix_dist_3d_hc <- fix_coordinates_expand %>%
  ## join with domains
  left_join(domains, by = "position") %>%
  ## keep only positions in the SPD domain
  filter(domain_short %in% c("SPD")) %>%
  ## remove positions with poor confidence in position
  filter(pLDDT > 70) %>%
  ## keep only pairs whose partner residue is a heavy chain epitope position
  ## (epitope positions themselves stay in this table; they are removed in
  ## fix_dist_3d_hc2 below)
  filter(position.1 %in% heavy_chain_epitopes) %>%
  ## find minimum distance to epitope for each position
  group_by(position) %>%
  slice(which.min(distance)) %>%
  ## drop the per-position grouping so a plain tibble is handed to the
  ## changepoint model and plots (the steps below are row-wise, so results
  ## are unchanged)
  ungroup() %>%
  ## calculate difference in scores
  mutate(score_diff = ab124 - ab102,
         epitope = case_when(position %in% heavy_chain_epitopes ~ "heavy chain",
                             TRUE ~ "non-epitope"))

## create version without known epitopes (and without missing differences)
fix_dist_3d_hc2 <- fix_dist_3d_hc %>%
  filter(!position %in% heavy_chain_epitopes,
         !is.na(score_diff))

```

```{r changepoint epitopes}

## two-segment model for mcp: score_diff is intercept-only in the first
## segment, then intercept + slope on distance after the change point
changepoint_model <- list(score_diff ~ 1, ~ 1 + distance)

## fit the model to the non-epitope positions of each chain
changepoint_fit_lc <- mcp(model = changepoint_model, data = fix_dist_3d_lc2)
changepoint_fit_hc <- mcp(model = changepoint_model, data = fix_dist_3d_hc2)

## first entry of the "mean" column of each fit summary, used as the
## change point distance
changepoint_lc <- pluck(summary(changepoint_fit_lc), "mean", 1)
changepoint_hc <- pluck(summary(changepoint_fit_hc), "mean", 1)

###############################################################################

## light chain positions at or below the change point distance
below_changepoint_lc <- fix_dist_3d_lc2$position[
  which(fix_dist_3d_lc2$distance <= changepoint_lc)
]

## heavy chain positions at or below the change point distance
below_changepoint_hc <- fix_dist_3d_hc2$position[
  which(fix_dist_3d_hc2$distance <= changepoint_hc)
]

###############################################################################

## epitope positions followed by the changepoint-derived positions
light_chain_all <- c(light_chain_epitopes, below_changepoint_lc)
heavy_chain_all <- c(heavy_chain_epitopes, below_changepoint_hc)

```

```{r changepoint analysis epitope plots}

## Supplementary Fig. 6a, light chain epitope changepoint analysis
## Score difference vs distance to the nearest light chain epitope residue.
lc_dist_3d <- fix_dist_3d_lc %>%
  ## add adjacent label
  mutate(epitope = case_when(position %in% below_changepoint_lc ~ "light chain-\nadjacent",
                             TRUE ~ epitope)) %>%
  ## plot
  ggplot(aes(x = distance, y = score_diff, color = epitope)) + 
  ## add points
  geom_point(alpha = 0.9, size = 1) +
  ## add changepoint boundary; passed as a constant (not through aes()) so a
  ## single line is drawn instead of one overplotted line per data row
  geom_vline(xintercept = changepoint_lc, linetype = "dashed") +
  ## add no difference line (also a single constant line)
  geom_hline(yintercept = 0, linetype = "dashed") +
  ## scale colors and axes; colors are matched to categories by name so they
  ## cannot shift if a category has no positions
  scale_color_manual(values = c("light chain" = "#24492EFF",
                                "light chain-\nadjacent" = "#79BD8BFF",
                                "non-epitope" = "grey80")) +
  scale_x_continuous(expand = c(0, 0),
                     limits = c(-1, 41),
                     breaks = seq(0, 40, by = 10),
                     labels = function(x) as.character(x)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-0.3, 1.05),
                     breaks = seq(-0.25, 1, by = 0.25),
                     labels = function(x) as.character(x)) +
  ## add labels
  labs(x = "Distance (angstroms)",
       y = "Difference in secretion scores") +
  ## change theme: untitled legend placed inside the panel
  theme(legend.title = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.position = "inside",
        legend.background = element_blank(),
        legend.position.inside = c(0.7, 0.8))

## save Supplementary Fig. 6a
ggsave(here("outputs", "supp_fig_panels", "S6a_light_chain_changepoint.pdf"),
       plot = lc_dist_3d, device = cairo_pdf,
       height = 40, width = 40, units = "mm")

###############################################################################

## Supplementary Fig. S6b, heavy chain epitope changepoint analysis
## Score difference vs distance to the nearest heavy chain epitope residue.
hc_dist_3d <- fix_dist_3d_hc %>%
  ## add adjacent label
  mutate(epitope = case_when(position %in% below_changepoint_hc ~ "heavy chain-\nadjacent",
                             TRUE ~ epitope)) %>%
  ## plot
  ggplot(aes(x = distance, y = score_diff, color = epitope)) + 
  ## add points
  geom_point(alpha = 0.9, size = 1) +
  ## add changepoint boundary; passed as a constant (not through aes()) so a
  ## single line is drawn instead of one overplotted line per data row
  geom_vline(xintercept = changepoint_hc, linetype = "dashed") +
  ## add no difference line (also a single constant line)
  geom_hline(yintercept = 0, linetype = "dashed") +
  ## scale colors and axes; colors are matched to categories by name so they
  ## cannot shift if a category has no positions
  scale_color_manual(values = c("heavy chain" = "#59629BFF",
                                "heavy chain-\nadjacent" = "#9198C0FF",
                                "non-epitope" = "grey80")) +
  scale_x_continuous(expand = c(0, 0),
                     limits = c(-1, 41),
                     breaks = seq(0, 40, by = 10),
                     labels = function(x) as.character(x)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-0.3, 1.05),
                     breaks = seq(-0.25, 1, by = 0.25),
                     labels = function(x) as.character(x)) +
  ## add labels
  labs(x = "Distance (angstroms)",
       y = "Difference in secretion scores") +
  ## change theme: untitled legend placed inside the panel
  theme(legend.title = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.position = "inside",
        legend.background = element_blank(),
        legend.position.inside = c(0.7, 0.8))

## save Supplementary Fig. S6b
ggsave(here("outputs", "supp_fig_panels", "S6b_heavy_chain_changepoint.pdf"),
       plot = hc_dist_3d, device = cairo_pdf,
       height = 40, width = 40, units = "mm")

###############################################################################

## classify every position for the callout plot; the first matching rule wins,
## so an epitope position is never relabeled as "adjacent"
epitope_adjacent <- epitopes %>%
  mutate(callout = case_when(position %in% light_chain_epitopes ~ "light chain",
                             position %in% below_changepoint_lc ~ "light chain-\nadjacent",
                             position %in% heavy_chain_epitopes ~ "heavy chain",
                             position %in% below_changepoint_hc ~ "heavy chain-\nadjacent",
                             TRUE ~ "non-epitope"),
         ## fixed level order controls the legend order
         callout = factor(callout, levels = c("non-epitope",
                                              "light chain",
                                              "light chain-\nadjacent",
                                              "heavy chain",
                                              "heavy chain-\nadjacent")))

## Supplementary Fig. S6c - epitope plot with newly identified positions
epitope_adjacent_plot <- ggplot() +
  ## add background ribbon for positions off diagonal
  geom_ribbon(data = epitope_plotrange,
              aes(x = x,
                  ymin = ymin,
                  ymax = ymax),
              fill = "grey", alpha = 0.7,
              show.legend = FALSE, inherit.aes = FALSE) +
  ## non-epitope points first, faint, so they sit underneath
  geom_point(data = epitope_adjacent %>%
               filter(callout == "non-epitope"),
             aes(x = ab102, 
                 y = ab124,
                 color = callout),
             size = 1, alpha = 0.3) +
  ## epitope and epitope-adjacent points on top, nearly opaque
  geom_point(data = epitope_adjacent %>%
               filter(callout != "non-epitope"),
             aes(x = ab102, 
                 y = ab124, 
                 color = callout),
             size = 1, alpha = 0.9) +
  ## add perfect correlation line
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  ## scale x and y axes, color
  scale_x_continuous(expand = c(0, 0),
                     limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = function(x) as.character(x)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = function(x) as.character(x)) +
  ## colors are matched to categories by name so that an empty category
  ## (e.g. no adjacent positions found) cannot shift the remaining colors
  scale_color_manual(values = c("non-epitope" = "grey50",
                                "light chain" = "#24492EFF",
                                "light chain-\nadjacent" = "#79BD8BFF",
                                "heavy chain" = "#59629BFF",
                                "heavy chain-\nadjacent" = "#9198C0FF")) +
  ## add titles
  labs(x = "Heavy chain secretion score",
       y = "Light chain secretion score") +
  theme(legend.key.size = unit(3, "mm"),
        legend.title = element_blank()) +
  guides(alpha = "none")

## save Supplementary Fig. S6c
ggsave(here("outputs", "supp_fig_panels", "S6c_epitope_adjacent.pdf"),
       plot = epitope_adjacent_plot, device = cairo_pdf,
       height = 40, width = 64, units = "mm")

```

```{r strep vs. chain comparisons}

## grey +/- 0.33 band around the diagonal for the strep tag comparisons;
## same construction as epitope_plotrange, but the upper edge switches to
## Inf at 1.4501 to match the wider axes (up to 1.451) used in these plots
epitope_plotrange2 <- tibble(x = seq(-1, 2, by = 0.01)) %>%
  mutate(ymin = if_else(x - 0.33 < -0.051, -Inf, x - 0.33),
         ymax = if_else(x + 0.33 > 1.4501, Inf, x + 0.33))

###############################################################################

## Fig. S5d - median score comparison of heavy chain and strep tag
heavy_vs_strep <- ggplot(epitopes,
                         aes(x = abstrep, y = ab102, color = chain)) +
  ## grey band around the diagonal
  geom_ribbon(data = epitope_plotrange2,
              mapping = aes(x = x, ymin = ymin, ymax = ymax),
              inherit.aes = FALSE, show.legend = FALSE,
              alpha = 0.3, fill = "grey") +
  ## one point per position
  geom_point(size = 1) +
  ## identity line
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  ## identical x and y axes; plain (non-scientific) tick labels
  scale_x_continuous(limits = c(-0.05, 1.451),
                     breaks = seq(0, 1.4, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-0.05, 1.451),
                     breaks = seq(0, 1.4, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## colors are assigned to the chain categories in sorted order
  scale_color_manual(values = c("#59629BFF", "#24492EFF", "grey50")) +
  ## axis and legend titles
  labs(color = "FIX subunit",
       x = "Strep tag secretion score",
       y = "Heavy chain secretion score") +
  theme(legend.key.size = unit(3, "mm"))

## write Fig. S5d to disk
ggsave(filename = here("outputs", "supp_fig_panels", "S5d_heavy_vs_strep.pdf"),
       plot = heavy_vs_strep, device = cairo_pdf,
       width = 64, height = 40, units = "mm")

###############################################################################

## Fig. S5e - median score comparison of light chain and strep tag
light_vs_strep <- epitopes %>%
  ## order rows so light chain positions come last (rows are drawn in order,
  ## so they end up on top of the other points)
  mutate(chain = factor(
    chain,
    levels = c("Signal peptide,\npropeptide, and\nactivation peptide",
               "Heavy chain", "Light chain")
  )) %>%
  arrange(chain) %>%
  ## color levels follow the row order established above
  ggplot(aes(x = abstrep, y = ab124, color = fct_inorder(chain))) +
  ## grey band around the diagonal
  geom_ribbon(data = epitope_plotrange2,
              mapping = aes(x = x, ymin = ymin, ymax = ymax),
              inherit.aes = FALSE, show.legend = FALSE,
              alpha = 0.3, fill = "grey") +
  ## one point per position
  geom_point(size = 1) +
  ## identity line
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  ## identical x and y axes; plain (non-scientific) tick labels
  scale_x_continuous(limits = c(-0.05, 1.451),
                     breaks = seq(0, 1.4, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-0.05, 1.451),
                     breaks = seq(0, 1.4, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## colors in level order: other regions, heavy chain, light chain
  scale_color_manual(values = c("grey50", "#59629BFF", "#24492EFF")) +
  ## list the legend entries in reverse of the level order
  guides(color = guide_legend(reverse = TRUE)) +
  ## axis and legend titles
  labs(color = "FIX subunit",
       x = "Strep tag secretion score",
       y = "Light chain secretion score") +
  theme(legend.key.size = unit(3, "mm"))

## write Fig. S5e to disk
ggsave(filename = here("outputs", "supp_fig_panels", "S5e_light_vs_strep.pdf"),
       plot = light_vs_strep, device = cairo_pdf,
       width = 64, height = 40, units = "mm")

```

```{r secretion peptide heatmap}

## Fig. 3a - secretion peptide heatmap
secpep_heatmap <- scored_variants_final %>%
  ## positions 1-28 only, scored with antibody 102 (heavy chain)
  filter(position <= 28, antibody == "102") %>%
  ## draw with the project's heatmap_plot helper
  heatmap_plot(data = .) +
  ## replace the x axis to get tidy breaks (will throw warning)
  scale_x_continuous(limits = c(0.5, 28.5),
                     breaks = c(1, 10, 20, 28),
                     expand = c(0, 0)) +
  ## legend along the bottom with small keys; vertical x tick labels
  theme(legend.position = "bottom",
        legend.key = element_rect(fill = NA, color = "black"),
        legend.key.width = unit(3, "mm"),
        legend.key.height = unit(1.5, "mm"),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ## legend order: functional score colorbar, WT marker, missing-data key
  guides(fill = guide_colorbar(title = "Functional score",
                               title.position = "top",
                               title.hjust = 0.5,
                               frame.colour = "grey20",
                               ticks.colour = "grey20",
                               order = 1),
         shape = guide_legend(title = "WT",
                              title.position = "top",
                              title.hjust = 0.5,
                              keywidth = unit(1.5, "mm"),
                              override.aes = list(size = 1),
                              order = 2),
         color = guide_legend(title = "Missing",
                              title.position = "top", 
                              title.hjust = 0.5,
                              keywidth = unit(1.5, "mm"),
                              override.aes = list(fill = "grey50"),
                              order = 3))

## write Fig. 3a to disk
ggsave(filename = here("outputs", "main_fig_panels",
                       "3a_secretion_peptide_heatmap.pdf"),
       plot = secpep_heatmap, device = cairo_pdf,
       width = 50, height = 40, units = "mm")

```

```{r signalP}

## SignalP 6.0 predictions joined to the measured scores
signalP_scores <- read_delim(
  here("inputs", "SignalP-6-results", "prediction_results.txt"),
  ## the first two lines are skipped and column names are supplied here
  skip = 2,
  col_names = c("variant", "sp_prediction", "prob_other",
                "prob_sp", "cleavage")
) %>%
  mutate(
    ## strip the "FA9_" prefix so variant names match the score tables
    variant = gsub(pattern = "FA9_", replacement = "", x = variant),
    ## readable prediction labels, with "Secreted" as the first level
    sp_prediction = factor(
      case_when(sp_prediction == "SP" ~ "SP6: Secreted",
                sp_prediction == "OTHER" ~ "SP6: Not secreted"),
      levels = c("SP6: Secreted", "SP6: Not secreted")
    )
  ) %>%
  ## add functional scores, then the per-antibody synonymous threshold
  left_join(scored_variants_average, by = "variant") %>%
  left_join(syn_threshold, by = "antibody") %>%
  ## drop unscored variants and anything beyond position 28
  filter(!is.na(average_score), position <= 28) %>%
  ## columns needed downstream
  select(variant, antibody, sp_prediction, average_score, se_score, lower)

###############################################################################

## light chain (antibody 124) scores, stacked twice: once split by signal
## peptide subregion (N/H/C) and once more as a pooled "All" group
signalP_scores_lc <- signalP_scores %>%
  ## the subregion copy excludes WT
  filter(variant != "WT") %>%
  ## position is parsed from the variant name and binned into N/H/C;
  ## "All" is declared as a level so it can be filled in after stacking
  mutate(position = parse_number(variant),
         region = factor(
           case_when(position %in% 1:12 ~ "N",
                     position %in% 13:24 ~ "H",
                     position %in% 25:28 ~ "C"),
           levels = c("All", "N", "H", "C")
         )) %>%
  ## append the full table again; its rows have no region yet ...
  bind_rows(signalP_scores) %>%
  ## ... so missing regions become the pooled "All" group
  replace_na(list(region = "All")) %>%
  ## keep only light chain
  filter(antibody == "124")

## number of variants per region and prediction, used as text labels
signalP_counts <- signalP_scores_lc %>%
  group_by(region, sp_prediction) %>%
  summarise(n = n(), .groups = "keep")

###############################################################################

## Fig. 3b - SignalP 6.0 predictions
signalP_plot <- ggplot(signalP_scores_lc,
                       aes(x = region,
                           y = average_score,
                           group = interaction(region, sp_prediction))) +
  ## score distribution per region as an unfilled violin
  geom_violin(position = position_dodge(width = 1),
              scale = "width", adjust = 2,
              color = "black", fill = NA, linewidth = 0.3) +
  ## individual variants, colored by prediction
  geom_jitter(aes(color = sp_prediction),
              position = position_jitterdodge(dodge.width = 1,
                                              jitter.width = 0.8),
              alpha = 0.2, size = 0.1,
              show.legend = FALSE) +
  ## narrow boxplot without outlier points
  geom_boxplot(position = position_dodge(width = 1),
               width = 0.1, linewidth = 0.3,
               color = "black", fill = NA,
               outliers = FALSE) +
  ## synonymous threshold as a dashed horizontal line
  geom_hline(aes(yintercept = unique(lower)),
             linetype = "dashed") +
  ## one panel per SignalP prediction
  facet_grid(cols = vars(sp_prediction)) +
  ## variant counts printed near the top of each panel
  geom_text(data = signalP_counts,
            mapping = aes(y = 1.25, label = n),
            size = 6 / .pt) +
  ## y axis and point colors
  scale_y_continuous(limits = c(-0.05, 1.3),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_color_manual(values = c("steelblue", "orange")) +
  ## axis titles
  labs(x = "Signal peptide subregion",
       y = "Light chain secretion score") +
  ## legend and facet strip styling
  theme(legend.position = "bottom",
        legend.key = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"),
        legend.background = element_rect(fill = NA),
        panel.spacing.x = unit(0, "line"),
        strip.placement = "outside") +
  guides(colour = guide_legend(title.position = "top",
                               title.hjust = 0.5,
                               override.aes = list(alpha = 1)))

## write Fig. 3b to disk
ggsave(filename = here("outputs", "main_fig_panels", "3b_signalP.pdf"),
       plot = signalP_plot, device = cairo_pdf,
       width = 60, height = 41, units = "mm")

###############################################################################

## calculate fraction of variants at each threshold for each region
    ## matched is written <SignalP call>-<measured call>; a variant counts as
    ## measured "Sec" when its score is at or above the synonymous threshold
    ## (lower)
    ## sp_prediction carries the "SP6: " prefix added when the SignalP labels
    ## were recoded, so the comparisons must use the prefixed labels;
    ## comparing against the bare "Secreted" / "Not secreted" strings matches
    ## nothing and leaves every row as NA
signalP_correlation <- signalP_scores_lc %>%
  ## identify matches
  mutate(matched = case_when(average_score >= lower & sp_prediction == "SP6: Secreted" ~ "Sec-Sec",
                             average_score < lower & sp_prediction == "SP6: Secreted" ~ "Sec-NoSec",
                             average_score >= lower & sp_prediction == "SP6: Not secreted" ~ "NoSec-Sec",
                             average_score < lower & sp_prediction == "SP6: Not secreted" ~ "NoSec-NoSec")) %>%
  ## three-way table: one matched x sp_prediction table per region
  tabyl(matched, sp_prediction, region) %>%
  ## add totals, column percentages, and counts
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("col") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns() %>%
  adorn_title()

```

```{r cysteine heatmap}

## unique positions whose WT residue is cysteine, used to subset and label
## the cysteine heatmap
cysteine_positions <- scored_variants_final %>%
  filter(wt_aa == "C") %>%
  distinct(position) %>%
  pull(position)

###############################################################################

## Fig. 3c - cysteine heatmap
    ## heavy chain scores at WT cysteine positions only, drawn side by side
    ## axis breaks, limits and labels are derived from cysteine_positions so
    ## they stay in step with the data instead of assuming a fixed count
    ## assumes positions 1 and 2 are not themselves WT cysteines
cysteine_heatmap <- scored_variants_final %>%
  ## keep only heavy chain
  filter(antibody == "102") %>%
  ## keep only WT cysteines
      ## heatmap_plot will exhibit unexpected behavior if first two positions are
      ## missing, not sure why
  filter(position %in% c(1, 2, cysteine_positions)) %>% 
  ## renumber the kept positions consecutively (1, 2, 3, ...) so the
  ## cysteines are adjacent on the x axis
      ## group ids follow ascending position, so positions 1 and 2 keep
      ## ids 1 and 2 and the cysteines start at 3
  group_by(position) %>%
  mutate(position = as.numeric(cur_group_id())) %>%
  ungroup() %>%
  ## plot using custom heatmap_plot function
  heatmap_plot() +
  ## adjust x axis, removing first two positions
      ## labels are sorted to match the ascending renumbering above
  scale_x_continuous(expand = c(0, 0),
                     limits = c(2.5, length(cysteine_positions) + 2.5),
                     breaks = seq_along(cysteine_positions) + 2,
                     labels = sort(cysteine_positions)) +
  ## adjust plot background, legend, title
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        legend.key = element_rect(fill = NA, color = "black"),
        legend.key.height = unit(1.5, "mm"),
        legend.key.width = unit(3, "mm"),
        legend.position = "bottom") +
  ## adjust legends
  guides(color = guide_legend(title = "Missing",
                              title.position = "top", 
                              title.hjust = 0.5,
                              keywidth = unit(1.5, "mm"),
                              override.aes = list(fill = "grey50"),
                              order = 3),
         shape = guide_legend(title = "WT",
                              title.position = "top",
                              title.hjust = 0.5,
                              keywidth = unit(1.5, "mm"),
                              override.aes = list(size = 1),
                              order = 2),
         fill = guide_colorbar(title = "Functional score",
                               title.position = "top",
                               title.hjust = 0.5,
                               frame.colour = "grey20",
                               ticks.colour = "grey20",
                               order = 1))

## save Fig. 3c
ggsave(here("outputs", "main_fig_panels", "3c_cysteine_heatmap.pdf"),
       plot = cysteine_heatmap, device = cairo_pdf,
       height = 41, width = 55, units = "mm")

```

```{r cysteine VAMP-seq comparisons}

## import all VAMP-seq data from maveDB (collected on 10.22.2022) and
## combine with FIX data
## input:
    ## series of csv files with VAMP-seq scores
        ## file names are split on "_" into first author, year, gene, method
    ## scored_variants_average dataframe
## output:
    ## 1: gene - gene name
    ## 2: wt_aa - WT amino acid
    ## 3: position - position in gene
    ## 4: var_aa - variant amino acid
    ## 5: score - average score for variant
vampseq_all <- list.files(path = here("inputs", "maveDB"),
                          ## pattern is a regular expression, not a glob:
                          ## match only names that end in ".csv"
                          pattern = "\\.csv$",
                          recursive = TRUE) %>%
  ## set vector names as file name, removing .csv
  set_names(., gsub("\\.csv$", "", basename(.))) %>%
  ## map across vector to read each file and import source name
  map_dfr(~read_csv(file = here("inputs", "maveDB", .x),
                    skip = 4), .id = "source") %>%
  ## separate source into useful information
  separate(source,
           into = c("first_author", "year", "gene", "method"),
           sep = "_") %>%
  ## separate hgvs_pro into variants
      ## [A-Za-z] rather than [A-z]: the latter also matches the punctuation
      ## that sits between "Z" and "a" ([, \, ], ^, _, `)
  mutate(hgvs_pro = gsub("p\\.", "", hgvs_pro)) %>%
  extract(hgvs_pro, into = c("wt_aa", "position", "var_aa"),
          regex = "([A-Za-z]+)([0-9]+)([A-Za-z]+|=)") %>%
  ## fix = to be wt_aa
  mutate(var_aa = case_when(var_aa == "=" ~ wt_aa,
                            TRUE ~ var_aa)) %>%
  ## remove NA (WT will be NA), so dictionary continues to work
  filter(!is.na(wt_aa)) %>%
  filter(!is.na(var_aa)) %>%
  ## convert variants to one letter forms for easy comparison to FIX
  mutate(position = as.numeric(position),
         wt_aa = unlist(mget(wt_aa, hash_aa3_to_aa1@.xData)),
         var_aa = unlist(mget(var_aa, hash_aa3_to_aa1@.xData))) %>%
  ## keep only necessary columns
  select(gene, wt_aa, position, var_aa, score) %>%
  ## combine with FIX variants
  bind_rows(scored_variants_average %>%
              ## create gene name
              mutate(gene = "FIX") %>%
              ## keep only heavy chain for simplicity
              filter(antibody == "102") %>%
              ## rename column to match
              rename(score = average_score) %>%
              ## keep only required columns
              select(gene, wt_aa, position, var_aa, score)) %>%
  ## remove missing, synonymous, WT, and X variants
  filter(wt_aa != var_aa,
         !is.na(wt_aa),
         var_aa != "X",
         !is.na(score))

###############################################################################

## Fig. 3d - average effect for WT cysteines
    ## per-gene mean (point) and standard error (bar) of scores for
    ## substitutions at WT cysteines, each gene tested against FIX
wildtype_cys_scores <- vampseq_all %>%
  ## scored substitutions at WT cysteines, excluding X variants
  filter(wt_aa == "C" & !is.na(score) & var_aa != "X") %>%
  ## sort rows by each gene's mean score so that fct_inorder() below lays
  ## the genes out from lowest to highest mean
  group_by(gene) %>%
  mutate(mean_score = mean(score)) %>%
  ungroup() %>%
  arrange(mean_score) %>%
  select(-mean_score) %>%
  ## plot
  ggplot(mapping = aes(x = fct_inorder(gene), y = score)) +
  ## mean as a point, mean +/- SE as an error bar
  stat_summary(geom = "point", fun = mean, size = 0.5) +
  stat_summary(geom = "errorbar", fun.data = mean_se,
               linewidth = 0.25, width = 0.1) +
  ## Bonferroni-adjusted t tests against FIX, shown as significance symbols
  stat_pwc(method = "t_test",
           ref.group = "FIX",
           p.adjust.method = "bonferroni",
           label = "{p.adj.signif}",
           label.size = 6 / .pt,
           y.position = 0.8,
           step.increase = 0,
           remove.bracket = TRUE) +
  ## y axis
  coord_cartesian(ylim = c(0, 1.02)) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Mean score for variants\nacross wildtype cysteines",
       x = "Gene")

## save Fig. 3d
ggsave(here("outputs", "main_fig_panels", "3d_vampseq_WT.pdf"),
       plot = wildtype_cys_scores, device = cairo_pdf,
       units = "mm", width = 60, height = 40)

###############################################################################

## Fig. 3f - average effect for variant cysteines
    ## per-gene mean (point) and standard error (bar) of scores for
    ## substitutions to cysteine, each gene tested against FIX
variant_cys_scores <- vampseq_all %>%
  ## scored substitutions whose variant residue is cysteine
  filter(var_aa == "C" & !is.na(score)) %>%
  ## sort rows by each gene's mean score so that fct_inorder() below lays
  ## the genes out from lowest to highest mean
  group_by(gene) %>%
  mutate(mean_score = mean(score)) %>%
  ungroup() %>%
  arrange(mean_score) %>%
  select(-mean_score) %>%
  ## plot
  ggplot(mapping = aes(x = fct_inorder(gene), y = score)) +
  ## mean as a point, mean +/- SE as an error bar
  stat_summary(geom = "point", fun = mean, size = 0.5) +
  stat_summary(geom = "errorbar", fun.data = mean_se,
               linewidth = 0.25, width = 0.1) +
  ## Bonferroni-adjusted t tests against FIX, shown as significance symbols
  stat_pwc(method = "t_test",
           ref.group = "FIX",
           p.adjust.method = "bonferroni",
           label = "{p.adj.signif}",
           label.size = 6 / .pt,
           y.position = 0.8,
           step.increase = 0,
           remove.bracket = TRUE) +
  ## y axis
  coord_cartesian(ylim = c(0, 1.02)) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Mean score for\nvariant cysteines",
       x = "Gene")

## save Fig. 3f
ggsave(here("outputs", "main_fig_panels", "3f_vampseq_variant.pdf"),
       plot = variant_cys_scores, device = cairo_pdf,
       units = "mm", width = 60, height = 40)

```

```{r variant effects by amino acid}

## Fig. 3e - plot average effect of all variants by WT amino acid
    ## notched boxplots of FIX scores, one box per WT residue
wt_variant_effect <- vampseq_all %>%
  ## FIX rows only
  filter(gene == "FIX") %>%
  ## fix the left-to-right order of the residues on the x axis
  mutate(wt_aa = factor(wt_aa,
                        levels = c("A", "V", "I", "L", "M", "F", "Y", "W",
                                   "S", "T", "N", "Q", "C", "G", "P", "R",
                                   "H", "K", "D", "E", "X"))) %>%
  ## plot
  ggplot(mapping = aes(x = wt_aa, y = score, fill = wt_aa)) +
  ## boxplots without outlier points or legend
  geom_boxplot(notch = TRUE, outlier.shape = NA,
               color = "black", alpha = 0.7,
               position = position_dodge(width = 0.9),
               show.legend = FALSE) +
  ## fill palette and y axis
  scale_fill_viridis_d(option = "C", end = 0.8) +
  scale_y_continuous(limits = c(-0.01, 1.26),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Secretion score",
       x = "WT amino acid")

## save Fig. 3e
ggsave(here("outputs", "main_fig_panels", "3e_FIX_WT_effect.pdf"),
       plot = wt_variant_effect, device = cairo_pdf,
       units = "mm", width = 110, height = 40)

###############################################################################

## Fig. S7a - plot average effect of all variants by WT amino acid
    ## one panel per WT residue, each with one horizontal box per gene
wt_variant_effect_all <- vampseq_all %>%
  ## order residues and genes, and attach each gene's fill color
  mutate(wt_aa = factor(wt_aa,
                        levels = c("A", "V", "I", "L", "M", "F", "Y", "W",
                                   "S", "T", "N", "Q", "C", "G", "P", "R",
                                   "H", "K", "D", "E", "X")),
         gene = factor(gene,
                       levels = c("FIX", "CYP2C9", "NUDT15", "PTEN",
                                  "TPMT", "VKOR")),
         ## look up the color by gene name (NA for any other gene)
         color_label = unname(c(FIX = "#1D457F",
                                CYP2C9 = "#535597",
                                NUDT15 = "#9B5F86",
                                PTEN = "#D36E69",
                                TPMT = "#EC8851",
                                VKOR = "#F2AF4A")[as.character(gene)])) %>%
  ## sort so that colors appear in gene order within each residue
  arrange(wt_aa, gene) %>%
  ## one nested data frame per WT residue
  group_by(wt_aa) %>% 
  nest() %>%
  ## build one plot per residue
  mutate(plot = map2(data, wt_aa,
                     ~ggplot(.x, aes(x = gene, y = score)) +
                       ## one box per gene; fills are the gene colors in
                       ## order of appearance in this residue's data
                       geom_boxplot(fill = unique(.x$color_label),
                                    color = "black", alpha = 0.7,
                                    linewidth = 0.1,
                                    notch = FALSE, outlier.shape = NA,
                                    position = position_dodge(width = 0.9),
                                    show.legend = FALSE) +
                       ## y axis
                       scale_y_continuous(limits = c(-0.55, 1.55),
                                          breaks = seq(-0.5, 1.5, by = 0.5),
                                          labels = as.character,
                                          expand = c(0, 0)) +
                       ## reverse gene order on the x axis
                       scale_x_discrete(limits = rev) +
                       ## titles; the panel title is the WT residue
                       labs(title = wt_aa,
                            y = "Secretion score",
                            x = "Gene") +
                       ## horizontal boxes
                       coord_flip())) %>%
  ## pull out the list of plots
  pull(plot) %>%
  ## lay the panels out five per row
  wrap_plots(ncol = 5)

## save Fig. S7a
ggsave(here("outputs", "supp_fig_panels", "S7a_all_genes_WT_effect.pdf"),
       plot = wt_variant_effect_all, device = cairo_pdf,
       units = "mm", width = 175, height = 90)

###############################################################################

## Fig. 3g - plot average effect of all variants by variant amino acid
    ## notched boxplots of FIX scores, one box per variant residue
variant_effect <- vampseq_all %>%
  ## FIX rows only
  filter(gene == "FIX") %>%
  ## fix the left-to-right order of the residues on the x axis
  mutate(var_aa = factor(var_aa,
                         levels = c("A", "V", "I", "L", "M", "F", "Y", "W",
                                    "S", "T", "N", "Q", "C", "G", "P", "R",
                                    "H", "K", "D", "E", "X"))) %>%
  ## plot
  ggplot(mapping = aes(x = var_aa, y = score, fill = var_aa)) +
  ## boxplots without outlier points or legend
  geom_boxplot(notch = TRUE, outlier.shape = NA,
               color = "black", alpha = 0.7,
               position = position_dodge(width = 0.9),
               show.legend = FALSE) +
  ## fill palette and y axis
  scale_fill_viridis_d(option = "C", end = 0.8) +
  scale_y_continuous(limits = c(-0.01, 1.26),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Secretion score",
       x = "Variant amino acid")

## save Fig. 3g
ggsave(here("outputs", "main_fig_panels", "3g_FIX_variant_effect.pdf"),
       plot = variant_effect, device = cairo_pdf,
       units = "mm", width = 110, height = 40)

###############################################################################

## Fig. S7b - plot average effect of all variants by variant amino acid
    ## one panel per variant residue, each with one horizontal box per gene
variant_effect_all <- vampseq_all %>%
  ## order residues and genes, and attach each gene's fill color
  mutate(var_aa = factor(var_aa,
                         levels = c("A", "V", "I", "L", "M", "F", "Y", "W",
                                    "S", "T", "N", "Q", "C", "G", "P", "R",
                                    "H", "K", "D", "E", "X")),
         gene = factor(gene,
                       levels = c("FIX", "CYP2C9", "NUDT15", "PTEN",
                                  "TPMT", "VKOR")),
         ## look up the color by gene name (NA for any other gene)
         color_label = unname(c(FIX = "#1D457F",
                                CYP2C9 = "#535597",
                                NUDT15 = "#9B5F86",
                                PTEN = "#D36E69",
                                TPMT = "#EC8851",
                                VKOR = "#F2AF4A")[as.character(gene)])) %>%
  ## sort so that colors appear in gene order within each residue
  arrange(var_aa, gene) %>%
  ## one nested data frame per variant residue
  group_by(var_aa) %>% 
  nest() %>%
  ## build one plot per residue
  mutate(plot = map2(data, var_aa,
                     ~ggplot(.x, aes(x = gene, y = score)) +
                       ## one box per gene; fills are the gene colors in
                       ## order of appearance in this residue's data
                       geom_boxplot(fill = unique(.x$color_label),
                                    color = "black", alpha = 0.7,
                                    linewidth = 0.1,
                                    notch = FALSE, outlier.shape = NA,
                                    position = position_dodge(width = 0.9),
                                    show.legend = FALSE) +
                       ## y axis
                       scale_y_continuous(limits = c(-0.55, 1.55),
                                          breaks = seq(-0.5, 1.5, by = 0.5),
                                          labels = as.character,
                                          expand = c(0, 0)) +
                       ## reverse gene order on the x axis
                       scale_x_discrete(limits = rev) +
                       ## titles; the panel title is the variant residue
                       labs(title = var_aa,
                            y = "Secretion score",
                            x = "Gene") +
                       ## horizontal boxes
                       coord_flip())) %>%
  ## pull out the list of plots
  pull(plot) %>%
  ## lay the panels out five per row
  wrap_plots(ncol = 5)

## save Fig. S7b
ggsave(here("outputs", "supp_fig_panels", "S7b_all_genes_variant_effect.pdf"),
       plot = variant_effect_all, device = cairo_pdf,
       units = "mm", width = 175, height = 90)

```

```{r carboxylation-sensitive FIX Gla pilot}

## import carboxylation-sensitive FIX Gla antibody data with controls
    ## reads every csv under inputs/flow/pilot_variants_001/scale with the
    ## custom read_flow_path function and labels each sample
pilot_carboxy001 <- list.files(path = here("inputs", "flow",
                                           "pilot_variants_001",
                                           "scale"),
                               ## pattern is a regular expression, not a
                               ## glob: match only names ending in ".csv"
                               pattern = "\\.csv$",
                               recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "pilot_variants_001", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab001 = alexa_647_a,
         bfp = bv450_a,
         mcherry = m_cherry_yg_a) %>%
  ## extract source path column to useable variables
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*_stained_(.*)_00[0-9]_(.*)\\+.csv") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
      ## every value is shifted up by |min| + 0.01
  mutate(adj_ab001 = ab001 + abs(min(ab001)) + 0.01,
         ## change names and order
             ## any sample name not listed here becomes NA
         variant = case_when(variant == "negative-control" ~ "Unrecombined",
                             variant == "FIX-wt" ~ "WT",
                             variant == "FIX-wt-warfarin" ~ "WT + warfarin"),
         variant = factor(variant,
                          levels = c("Unrecombined", "WT", "WT + warfarin")))

###############################################################################

## Fig. S9b - pilot carboxylation-sensitive FIX Gla antibody
    ## ridgeline densities of the shifted antibody signal, one per sample
pilot_carboxy001_plot <- ggplot(data = pilot_carboxy001,
                                mapping = aes(x = adj_ab001,
                                              y = variant)) +
  ## flow cytometry distributions, each scaled to its own maximum
  geom_density_ridges(mapping = aes(height = after_stat(ndensity),
                                    color = variant,
                                    fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis with power-of-ten labels
  scale_x_log10(limits = c(0.95, 1.05e5),
                labels = trans_format("log10", math_format(10^.x)),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4)) +
  ## first sample at the top
  scale_y_discrete(limits = rev) +
  ## matching outline and fill colors
  scale_color_manual(values = c("#0D0887", "#6A00A8", "#B12A90")) +
  scale_fill_manual(values = c("#0D0887", "#6A00A8", "#B12A90")) +
  ## x axis title
  labs(x = "Carboxylation-sensitive\nFIX Gla-Alexa-647") +
  ## baseline grid lines, no y axis title
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## save Fig. S9b
ggsave(here("outputs", "supp_fig_panels", "S9b_pilot_001.pdf"),
       plot = pilot_carboxy001_plot, device = cairo_pdf,
       units = "mm", width = 45, height = 36.5)

```

```{r carboxylation-sensitive pan-Gla pilot}

## import carboxylation-sensitive pan-Gla antibody data with controls
    ## reads every csv under inputs/flow/pilot_variants_3570/scale with the
    ## custom read_flow_path function and labels each sample
pilot_carboxy3570 <- list.files(path = here("inputs", "flow",
                                            "pilot_variants_3570",
                                            "scale"),
                                ## pattern is a regular expression, not a
                                ## glob: match only names ending in ".csv"
                                pattern = "\\.csv$",
                                recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "pilot_variants_3570", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab3570 = alexa_647_a,
         bfp = bv450_a,
         mcherry = m_cherry_yg_a) %>%
  ## extract source path column to useable variables
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*_stained_(.*)_00[0-9]_(.*)\\+.csv") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
      ## every value is shifted up by |min| + 0.01
  mutate(adj_ab3570 = ab3570 + abs(min(ab3570)) + 0.01,
         ## change names and order
             ## any sample name not listed here becomes NA
         variant = case_when(variant == "negative-control" ~ "Unrecombined",
                             variant == "FIX-wt" ~ "WT",
                             variant == "FIX-wt-warfarin" ~ "WT + warfarin"),
         variant = factor(variant,
                          levels = c("Unrecombined", "WT", "WT + warfarin")))

###############################################################################

## Fig. S9c - pilot carboxylation-sensitive pan-Gla antibody
    ## ridgeline densities of the shifted antibody signal, one per sample
pilot_carboxy3570_plot <- ggplot(data = pilot_carboxy3570,
                                 mapping = aes(x = adj_ab3570,
                                               y = variant)) +
  ## flow cytometry distributions, each scaled to its own maximum
  geom_density_ridges(mapping = aes(height = after_stat(ndensity),
                                    color = variant,
                                    fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis with power-of-ten labels
  scale_x_log10(limits = c(0.95, 1.05e5),
                labels = trans_format("log10", math_format(10^.x)),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4)) +
  ## first sample at the top
  scale_y_discrete(limits = rev) +
  ## matching outline and fill colors
  scale_color_manual(values = c("#0D0887", "#6A00A8", "#B12A90")) +
  scale_fill_manual(values = c("#0D0887", "#6A00A8", "#B12A90")) +
  ## x axis title
  labs(x = "Carboxylation-sensitive\nGla-motif-Alexa-647") +
  ## baseline grid lines, no y axis title
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## save Fig. S9c
ggsave(here("outputs", "supp_fig_panels", "S9c_pilot_ab3570.pdf"),
       plot = pilot_carboxy3570_plot, device = cairo_pdf,
       units = "mm", width = 45, height = 36.5)

```

```{r carboxylation heatmaps}

## pull the stored carboxylation-sensitive FIX Gla heatmap (antibody 001)
## out of full_heatmaps
FIX_gla_heatmap <- full_heatmaps %>%
  ## row for antibody 001
  filter(antibody == "001") %>%
  ## first entry of the plot list column
  pluck("plot", 1)

## save Fig. 4b
ggsave(here("outputs", "main_fig_panels", "4b_carboxy_sens_FIX_gla_heatmap.pdf"),
       plot = FIX_gla_heatmap, device = cairo_pdf,
       units = "mm", width = 180, height = 50)

###############################################################################

## pull the stored carboxylation-sensitive pan-Gla heatmap (antibody 3570)
## out of full_heatmaps
pan_gla_heatmap <- full_heatmaps %>%
  ## row for antibody 3570
  filter(antibody == "3570") %>%
  ## first entry of the plot list column
  pluck("plot", 1)

## save Fig. 4c
ggsave(here("outputs", "main_fig_panels", "4c_carboxy_sens_pan_gla_heatmap.pdf"),
       plot = pan_gla_heatmap, device = cairo_pdf,
       units = "mm", width = 180, height = 50)

```

```{r zoom in carboxylation maps}

## Fig. S9d-f - zoomed in propeptide heatmaps
    ## one heatmap per antibody (001, 3570, 124) restricted to positions
    ## 29-46, stacked in a single column with a shared legend
propeptide_zoom_heatmap <- scored_variants_final %>%
  ## add antibody labels
  left_join(antibody_table, by = "antibody") %>%
  ## keep only carboxylation and representative secretion antibody
  filter(antibody %in% c("001", "3570", "124")) %>%
  ## keep only the zoomed window (positions 29-46)
      ## heatmap_plot will exhibit unexpected behavior if first two positions are
      ## missing, not sure why
  filter(position %in% c(1, 2, seq(29, 46, by = 1))) %>%
  ## rearrange carboxylation antibodies to be above secretion
  mutate(antibody = factor(antibody, levels = c("001", "3570", "124"))) %>%
  arrange(antibody) %>%
  ## nest by labeling variable
  group_by(antibody) %>%
  nest() %>%
  ## create plots with map2 function
  mutate(plot = map2(data, antibody, ~heatmap_plot(data = .x) +
                       ## scale x axis to have nice breaks
                           ## the limits hide positions 1 and 2
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(28.5, 46.5),
                                          breaks = c(29, 35, 40, 46)) + 
                       ## add title
                           ## antibody_label repeats on every row of the
                           ## nested data; unique() reduces it to the one
                           ## label so the title is not a per-row vector
                       labs(title = unique(.x$antibody_label)) +
                       ## adjust plot background, legend, title
                       theme(axis.text.x = element_text(angle = 90,
                                                        hjust = 1, vjust = 0.5),
                             legend.key = element_rect(fill = NA, color = "black"),
                             legend.key.height = unit(1.5, "mm"),
                             legend.key.width = unit(4.5, "mm"),
                             legend.position = "bottom") +
                       ## adjust legends
                       guides(color = guide_legend(title = "Missing",
                                                   title.position = "top", 
                                                   title.hjust = 0.5,
                                                   keywidth = unit(1.5, "mm"),
                                                   override.aes = list(fill = "grey50"),
                                                   order = 3),
                              shape = guide_legend(title = "WT",
                                                   title.position = "top",
                                                   title.hjust = 0.5,
                                                   keywidth = unit(1.5, "mm"),
                                                   override.aes = list(size = 1),
                                                   order = 2),
                              fill = guide_colorbar(title = "Functional score",
                                                    title.position = "top",
                                                    title.hjust = 0.5,
                                                    frame.colour = "grey20",
                                                    ticks.colour = "grey20",
                                                    order = 1)))) %>%
  ## grab plots
  pull(plot) %>%
  ## plot nicely
  wrap_plots(ncol = 1, guides = "collect", axis_titles = "collect_x") &
  theme(legend.position = "bottom")

## save Fig. S9d-f
ggsave(here("outputs", "supp_fig_panels", "S9def_propeptide_heatmap.pdf"),
       plot = propeptide_zoom_heatmap, device = cairo_pdf,
       height = 140, width = 60, units = "mm")

###############################################################################

## Fig. S9g-i - zoomed in Gla heatmaps
    ## one heatmap per antibody (001, 3570, 124) restricted to positions
    ## 47-92, stacked in a single column with a shared legend
gla_zoom_heatmap <- scored_variants_final %>%
  ## add antibody labels
  left_join(antibody_table, by = "antibody") %>%
  ## keep only carboxylation and representative secretion antibody
  filter(antibody %in% c("001", "3570", "124")) %>%
  ## keep only the zoomed window (positions 47-92)
      ## heatmap_plot will exhibit unexpected behavior if first two positions are
      ## missing, not sure why
  filter(position %in% c(1, 2, seq(47, 92, by = 1))) %>%
  ## rearrange carboxylation antibodies to be above secretion
  mutate(antibody = factor(antibody, levels = c("001", "3570", "124"))) %>%
  arrange(antibody) %>%
  ## nest by labeling variable
  group_by(antibody) %>%
  nest() %>%
  ## create plots with map2 function
  mutate(plot = map2(data, antibody, ~heatmap_plot(data = .x) +
                       ## scale x axis to have nice breaks
                           ## the limits hide positions 1 and 2
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(46.5, 92.5),
                                          breaks = c(47, 60, 70, 80, 92)) + 
                       ## add title
                           ## antibody_label repeats on every row of the
                           ## nested data; unique() reduces it to the one
                           ## label so the title is not a per-row vector
                       labs(title = unique(.x$antibody_label)) +
                       ## adjust plot background, legend, title
                       theme(axis.text.x = element_text(angle = 90,
                                                        hjust = 1, vjust = 0.5),
                             legend.key = element_rect(fill = NA, color = "black"),
                             legend.key.height = unit(1.5, "mm"),
                             legend.key.width = unit(4.5, "mm"),
                             legend.position = "bottom") +
                       ## adjust legends
                       guides(color = guide_legend(title = "Missing",
                                                   title.position = "top", 
                                                   title.hjust = 0.5,
                                                   keywidth = unit(1.5, "mm"),
                                                   override.aes = list(fill = "grey50"),
                                                   order = 3),
                              shape = guide_legend(title = "WT",
                                                   title.position = "top",
                                                   title.hjust = 0.5,
                                                   keywidth = unit(1.5, "mm"),
                                                   override.aes = list(size = 1),
                                                   order = 2),
                              fill = guide_colorbar(title = "Functional score",
                                                    title.position = "top",
                                                    title.hjust = 0.5,
                                                    frame.colour = "grey20",
                                                    ticks.colour = "grey20",
                                                    order = 1)))) %>%
  ## grab plots
  pull(plot) %>%
  ## plot nicely
  wrap_plots(ncol = 1, guides = "collect", axis_titles = "collect_x") &
  theme(legend.position = "bottom")

###############################################################################

## save Fig. S9g-i
ggsave(here("outputs", "supp_fig_panels", "S9ghi_Gla_heatmap.pdf"),
       plot = gla_zoom_heatmap, device = cairo_pdf,
       height = 140, width = 120, units = "mm")

```

```{r carboxylation distributions}

## Fig. 4d - carboxylation-sensitive FIX Gla distribution of scores
    ## stored plot for antibody 001, taken from all_distributions
FIX_gla_distribution <- all_distributions %>%
  ## row for antibody 001
  filter(antibody == "001") %>%
  ## first entry of the plot list column
  pluck("plot", 1)

## save Fig. 4d
ggsave(here("outputs", "main_fig_panels",
            "4d_carboxy_sens_FIX_gla_distribution.pdf"),
       plot = FIX_gla_distribution, device = cairo_pdf,
       units = "mm", width = 45, height = 30)

###############################################################################

## Fig. 4e - score distribution for the carboxylation-sensitive pan-Gla
## antibody (3570); the plot object is stored in the `plot` list column
pan_gla_distribution <- all_distributions %>%
  ## rows for the pan-Gla antibody (3570) only
  filter(antibody == "3570") %>%
  ## take the first stored plot
  pull(plot) %>% pluck(1)

## save Fig. 4e as a 45 x 30 mm PDF
ggsave(
  filename = here("outputs", "main_fig_panels",
                  "4e_carboxy_sens_pan_gla_distribution.pdf"),
  plot = pan_gla_distribution,
  device = cairo_pdf,
  width = 45,
  height = 30,
  units = "mm"
)

```

```{r find carboxylation-sensitive positions}

## identify carboxylation-sensitive positions
## a position is called carboxylation-sensitive when its light chain (124)
## score exceeds its carboxylation score (001 or 3570) by more than 0.2
## input:
    ## epitopes dataframe
    ## domains dataframe
## output:
    ## 1: position - position of variant
    ## 2: ab001 - median missense score at given position for 001 antibody
    ## 3: ab102 - median missense score at given position for 102 antibody
    ## 4: ab124 - median missense score at given position for 124 antibody
    ## 5: ab3570 - median missense score at given position for 3570 antibody
    ## 6: abstrep - median missense score at given position for strep antibody
    ## 7: epitope - identifier whether position is within antibody epitope
    ## 8: label - position number for FIX-Gla carboxylation-sensitive positions,
    ##            NA otherwise (overwrites the incoming epitope label)
    ## 9: chain - identifier of protein chain
    ## 10: domain - FIX domain
    ## 11: domain_short - shorthand form of FIX domain
    ## 12: carboxy_sens - carboxylation-sensitivity call from FIX-Gla (001)
    ## 13: carboxy_sens2 - carboxylation-sensitivity call from pan-Gla (3570)
    ## 14: label2 - position number for pan-Gla carboxylation-sensitive
    ##              positions, NA otherwise
carboxylation <- epitopes %>%
  ## remove epitope positions (falsely low secretion)
  filter(epitope != "Light chain epitope") %>%
  filter(epitope != "Heavy chain epitope") %>%
  ## add FIX protein domains
  left_join(domains, by = "position") %>%
  ## find carboxylation-sensitive positions (FIX-Gla)
  ## `<=` makes the two branches exhaustive, so a position sitting exactly on
  ## the cutoff is classified instead of falling through to NA; positions with
  ## a missing score still get NA
  mutate(carboxy_sens = case_when(ab124 > ab001 + 0.2 ~
                                    "Carboxylation-sensitive",
                                  ab124 <= ab001 + 0.2 ~
                                    "Not carboxylation-sensitive"),
         ## find carboxylation-sensitive positions (pan-Gla)
         carboxy_sens2 = case_when(ab124 > ab3570 + 0.2 ~
                                     "Carboxylation-sensitive",
                                   ab124 <= ab3570 + 0.2 ~
                                     "Not carboxylation-sensitive"),
         ## add label for FIX-Gla (only sensitive positions are labelled)
         label = case_when(carboxy_sens == "Carboxylation-sensitive" ~ position,
                           TRUE ~ NA_real_),
         ## add label for pan-Gla (only sensitive positions are labelled)
         label2 = case_when(carboxy_sens2 == "Carboxylation-sensitive" ~ position,
                            TRUE ~ NA_real_))

###############################################################################

## subset of positions called carboxylation-sensitive by the FIX-Gla
## comparison that also lie in the Gla domain (all score columns retained)
carboxylation_FIXgla_low <- carboxylation %>%
  filter(carboxy_sens == "Carboxylation-sensitive") %>%
  filter(domain == "Gla")

###############################################################################

## Fig. 4f - FIX-Gla carboxylation score (001) against light chain
## secretion score (124), one point per position
carboxy001_vs_secretion <- ggplot(carboxylation,
                                  aes(x = ab124, y = ab001)) +
  ## positions, colored by FIX domain
  geom_point(aes(color = domain), alpha = 0.7, size = 1) +
  ## dashed line y = x - 0.2
  geom_abline(intercept = -0.2, slope = 1,
              color = "black", linetype = "dashed") +
  ## position numbers; `label` is NA for positions that should stay unlabelled
  geom_text_repel(aes(label = label),
                  box.padding = 0.005,
                  size = 6 / .pt, color = "black") +
  ## correlation coefficient, anchored at (1.05, 1.2)
  stat_cor(aes(label = paste(after_stat(r.label))),
           label.x = 1.05, label.y = 1.2,
           hjust = 0, size = 6 / .pt) +
  ## identical x and y axes: -0.05 to 1.25, ticks every 0.2
  scale_x_continuous(limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-0.05, 1.25),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## domain colors
  scale_color_manual(values = paletteer_d("PNWColors::Sunset")) +
  ## axis titles
  labs(y = "Carboxylation-sensitive\nFIX Gla carboxylation score",
       x = "Light chain secretion score") +
  ## untitled legend with a transparent background, anchored by its top-left
  ## corner inside the panel
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"),
        legend.justification = c(0, 1),
        legend.position = c(-0.01, 1.03))

## save Fig. 4f as a 70 x 57.2 mm PDF
ggsave(
  filename = here("outputs", "main_fig_panels", "4f_carboxy_sens_positions.pdf"),
  plot = carboxy001_vs_secretion,
  device = cairo_pdf,
  width = 70,
  height = 57.2,
  units = "mm"
)

```

```{r variant mechanism effects}

## classify each variant's mechanism of effect from its per-antibody scores
## input:
    ## scored_variants_average dataframe (long: one row per variant x antibody)
    ## syn_threshold_wide dataframe (lower_* = lower score bound per antibody)
    ## light_chain_epitopes, heavy_chain_epitopes position vectors
## output (wide: one row per variant):
    ## variant, position, one ab<antibody> score column per antibody
    ## outcome - "WT-like", "secretion-deficient", "carboxylation-deficient",
    ##           or NA when no rule below matches
## notes:
    ## - case_when() uses the FIRST matching rule, so rule order is significant
    ## - `%>%` binds tighter than `<` / `>=`, so each comparison reads as
    ##   score >= (syn_threshold_wide %>% pull(lower_x))
    ## - 124 is the light chain antibody, 001 FIX-Gla and 3570 pan-Gla; 102 is
    ##   presumably the heavy chain antibody - TODO confirm
variant_classification <- scored_variants_average %>%
  ## missense-only filter is disabled here (all variant types are classified);
  ## missense variants are selected later when the outcomes are counted
  #filter(wt_aa != var_aa & var_aa != "X") %>%
  ## keep only useful columns
  select(variant, position, antibody, average_score) %>% 
  ## make wide
  pivot_wider(names_from = antibody,
              names_prefix = "ab",
              values_from = average_score) %>%
  ## keep only variants scored for every antibody: if_all() drops a variant as
  ## soon as ANY column whose name contains "ab" is NA
  filter(if_all(contains("ab"), ~ !is.na(.))) %>%
  ## classify variant mechanisms of effect
  mutate(outcome = case_when(
    ## light chain epitopes
    ## (secretion is judged from ab102 only; ab124 is not consulted here)
    position %in% light_chain_epitopes &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab3570 >= syn_threshold_wide %>% pull(lower_3570)
    ~ "WT-like",
    position %in% light_chain_epitopes & 
      ab102 < syn_threshold_wide %>% pull(lower_102)
    ~ "secretion-deficient",
    position %in% light_chain_epitopes &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab001 < syn_threshold_wide %>% pull(lower_001)
    ~ "carboxylation-deficient",
    position %in% light_chain_epitopes &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab3570 < syn_threshold_wide %>% pull(lower_3570)
    ~ "carboxylation-deficient",
    ## heavy chain epitopes
    ## (secretion is judged from ab124 only; ab102 is not consulted here)
    position %in% heavy_chain_epitopes &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab3570 >= syn_threshold_wide %>% pull(lower_3570)
    ~ "WT-like",
    position %in% heavy_chain_epitopes & 
      ab124 < syn_threshold_wide %>% pull(lower_124)
    ~ "secretion-deficient",
    position %in% heavy_chain_epitopes &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      ab001 < syn_threshold_wide %>% pull(lower_001)
    ~ "carboxylation-deficient",
    position %in% heavy_chain_epitopes &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      ab3570 < syn_threshold_wide %>% pull(lower_3570)
    ~ "carboxylation-deficient",
    ## not in epitopes
    ## (epitope positions were all handled above, so the remaining rules only
    ## ever decide non-epitope positions)
    ## all four scores at or above threshold
    ab124 >= syn_threshold_wide %>% pull(lower_124) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab3570 >= syn_threshold_wide %>% pull(lower_3570) 
    ~ "WT-like",
    ## both secretion antibodies below threshold
    ab124 < syn_threshold_wide %>% pull(lower_124) &
      ab102 < syn_threshold_wide %>% pull(lower_102)
    ~ "secretion-deficient",
    ## FIX-Gla score below threshold AND more than 0.2 below the lower of the
    ## two secretion scores
    ab001 < pmin(ab124, ab102) - 0.2 &
      ab001 < syn_threshold_wide %>% pull(lower_001)
    ~ "carboxylation-deficient",
    ## FIX-Gla score low but within 0.2 of the lower secretion score, and one
    ## secretion antibody below threshold
    ab001 >= pmin(ab124, ab102) - 0.2 &
    ab124 < syn_threshold_wide %>% pull(lower_124) &
      ab001 < syn_threshold_wide %>% pull(lower_001)
    ~ "secretion-deficient",
    ab001 >= pmin(ab124, ab102) - 0.2 &
    ab102 < syn_threshold_wide %>% pull(lower_102) &
      ab001 < syn_threshold_wide %>% pull(lower_001)
    ~ "secretion-deficient",
    ## FIX-Gla score passes and exactly one secretion antibody fails:
    ## the strep score decides between WT-like and secretion-deficient
    ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab124 < syn_threshold_wide %>% pull(lower_124) &
      abstrep >= syn_threshold_wide %>% pull(lower_strep)
    ~ "WT-like",
    ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab102 < syn_threshold_wide %>% pull(lower_102) &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      abstrep >= syn_threshold_wide %>% pull(lower_strep)
    ~ "WT-like",
    ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab124 < syn_threshold_wide %>% pull(lower_124) &
      abstrep < syn_threshold_wide %>% pull(lower_strep)
    ~ "secretion-deficient",
    ab001 >= syn_threshold_wide %>% pull(lower_001) &
      ab102 < syn_threshold_wide %>% pull(lower_102) &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      abstrep < syn_threshold_wide %>% pull(lower_strep)
    ~ "secretion-deficient",
    ## NOTE(review): the next two rules require the same conditions as the two
    ## "within 0.2" rules above plus a strep check, so they look unreachable
    ## (any row they match was already classified) - confirm
    ab001 < syn_threshold_wide %>% pull(lower_001) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab124 < syn_threshold_wide %>% pull(lower_124) &
      abstrep >= syn_threshold_wide %>% pull(lower_strep) &
      ab001 >= pmin(ab124, ab102) - 0.2 
    ~ "secretion-deficient",
    ab001 < syn_threshold_wide %>% pull(lower_001) &
      ab102 < syn_threshold_wide %>% pull(lower_102) &
      ab124 >= syn_threshold_wide %>% pull(lower_124) &
      abstrep >= syn_threshold_wide %>% pull(lower_strep) &
      ab001 >= pmin(ab124, ab102) - 0.2 
    ~ "secretion-deficient",
    ## both secretion antibodies pass but a carboxylation antibody fails
    ab001 < syn_threshold_wide %>% pull(lower_001) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab124 >= syn_threshold_wide %>% pull(lower_124) 
    ~ "carboxylation-deficient",
    ab3570 < syn_threshold_wide %>% pull(lower_3570) &
      ab102 >= syn_threshold_wide %>% pull(lower_102) &
      ab124 >= syn_threshold_wide %>% pull(lower_124) 
    ~ "carboxylation-deficient"))

## tabulate outcomes for all missense variants
## (missense = first and last characters of the variant name differ and the
## last character is not "X")
variant_class_count_all <- variant_classification %>%
  filter(str_sub(variant, start = 1, end = 1) !=
           str_sub(variant, start = -1, end = -1) &
           str_sub(variant, start = -1, end = -1) != "X") %>%
  ## counts and percentages per outcome, plus a totals row
  tabyl(outcome) %>%
  adorn_pct_formatting(digits = 1) %>%
  adorn_totals("row") 

## same tabulation restricted to positions 29 through 92
carboxy_count <- variant_classification %>%
  filter(str_sub(variant, start = 1, end = 1) !=
           str_sub(variant, start = -1, end = -1) &
           str_sub(variant, start = -1, end = -1) != "X" &
           position %in% 29:92) %>%
  tabyl(outcome) %>% 
  adorn_pct_formatting(digits = 1) %>%
  adorn_totals("row")
  
```

```{r pymol carboxylation Gla domain - FIX-specific}

## positions 47-92 whose wild-type residue is glutamate (E)
## input:
    ## carboxylation dataframe
    ## wt_FIX_aa dataframe (wild-type amino acid per position)
carboxy_gluts <- carboxylation %>%
  ## Gla domain rows only
  filter(domain == "Gla") %>%
  ## add a row for every position from 47 to 92 that is absent
  complete(position = seq(from = 47, to = 92, by = 1)) %>%
  ## attach the wild-type amino acid
  left_join(wt_FIX_aa, by = "position") %>%
  ## keep glutamates and return their positions as a vector
  filter(wt_aa == "E") %>%
  pull(position)

## build one pymol "color" command per Gla-domain position (47-92), colored by
## the ratio of the FIX-Gla score (001) to the light chain score (124)
## input:
    ## carboxylation dataframe
carboxylation_ratios <- carboxylation %>%
  ## Gla domain rows only
  filter(domain == "Gla") %>%
  ## add a row for every position from 47 to 92 that is absent
  complete(position = seq(from = 47, to = 92, by = 1)) %>%
  ## floor negative ab001 scores at 0 before taking the ratio
  mutate(ab001 = pmax(ab001, 0),
         carboxy_ratio = ab001 / ab124,
         ## bin the ratio in steps of 0.2; the first matching rule wins, so
         ## each rule only needs its upper bound. Missing ratios are grey
         color_position = case_when(is.na(carboxy_ratio) ~ "grey50",
                                    carboxy_ratio < 0.2 ~ "0x0000FF",
                                    carboxy_ratio < 0.4 ~ "0x3333FF",
                                    carboxy_ratio < 0.6 ~ "0x6666FF",
                                    carboxy_ratio < 0.8 ~ "0x9999FF",
                                    carboxy_ratio < 1.0 ~ "0xFFFFFF",
                                    TRUE ~ "0xFF9999"),
         ## shift numbering by 46 to match the residue numbers used in pymol
         position_new = position - 46,
         ## assemble the pymol coloring command
         command = paste0("color ", color_position, ", resi ", position_new))

## vector of coloring commands, one per position
carboxy_positions <- pull(carboxylation_ratios, command)

## pymol script (one command per element) that renders the FIX Gla domain
## colored by carboxylation ratio
pymol_carboxy_setup <- c(
  "reinitialize",
  ## Huang 2004 FIX Gla structure (with 10C12 antibody)
  "fetch 1nl0",
  ## white, transparent background
  "bg_color white",
  "set opaque_background, 0",
  ## publication-style ray tracing
  "set ray_trace_mode, 1",
  "set ray_trace_gain, 0.00000",
  ## keep chain G (FIX Gla domain) and delete everything else
  "select FIX, chain G",
  "remove not FIX",
  ## carboxylated residues: show side chains as sticks
  "select gla, resn cgu",
  "show sticks, gla and not (name c+n)",
  ## cysteines (disulfide bridge): show side chains as sticks
  "show sticks, resn cys and not (name c+n)",
  ## per-position coloring commands, joined into one newline-separated string
  paste0(carboxy_positions, collapse = "\n"),
  ## side chain colors
  "color red, gla and elem o",
  "color yellow, resn cys and not (name ca+c+n+o)",
  ## drop waters
  "remove resn hoh",
  ## color and resize Ca ions
  ## Pauling 1960 - Ca VI in crystal = 1.14 angstroms
  "color teal, elem ca",
  "alter elem ca, vdw = 1.14",
  "rebuild",
  ## standardized viewpoint
  "set_view (0.111095250, 0.758084536, -0.642623723, 0.989960790, -0.027554817, 0.138635233, 0.087390937, -0.651574373, -0.753535390, 0.000006676, -0.000007272, -108.327758789, 47.139995575, 4.157243252, -5.074407101, 89.470329285, 127.185188293, -20.000000000)"
)

## write the script as a pml file, ending with a png command that ray-traces
## the view to 4j_carboxylation_ratio.png
file_conn <- file(here("outputs", "main_fig_panels", "4j_carboxylation_ratio.pml"))
writeLines(
  c(pymol_carboxy_setup,
    paste0("png ",
           here("outputs", "main_fig_panels", "4j_carboxylation_ratio.png"),
           ", height = 5.5cm, dpi = 600, ray = 1")),
  con = file_conn
)
close(file_conn)

```

```{r pymol carboxylation Gla domain - Gla motif}

## build one pymol "color" command per Gla-domain position (47-92), colored by
## the ratio of the pan-Gla score (3570) to the 102 antibody score
## input:
    ## carboxylation dataframe
## NOTE(review): the denominator here is ab102, whereas the FIX-specific
## version of this calculation divides by ab124 - confirm this is intended
gla_ratios <- carboxylation %>%
  ## Gla domain rows only
  filter(domain == "Gla") %>%
  ## add a row for every position from 47 to 92 that is absent
  complete(position = seq(from = 47, to = 92, by = 1)) %>%
  ## floor negative ab3570 scores at 0 before taking the ratio
  mutate(ab3570 = pmax(ab3570, 0),
         carboxy_ratio = ab3570 / ab102,
         ## bin the ratio; the first matching rule wins, so each rule only
         ## needs its upper bound. White spans 0.8 to 1.2; missing is grey
         color_position = case_when(is.na(carboxy_ratio) ~ "grey50",
                                    carboxy_ratio < 0.2 ~ "0x0000FF",
                                    carboxy_ratio < 0.4 ~ "0x3333FF",
                                    carboxy_ratio < 0.6 ~ "0x6666FF",
                                    carboxy_ratio < 0.8 ~ "0x9999FF",
                                    carboxy_ratio < 1.2 ~ "0xFFFFFF",
                                    TRUE ~ "0xFF9999"),
         ## shift numbering by 46 to match the residue numbers used in pymol
         position_new = position - 46,
         ## assemble the pymol coloring command
         command = paste0("color ", color_position, ", resi ", position_new))

## vector of coloring commands, one per position
gla_positions <- pull(gla_ratios, command)

## pymol script (one command per element) that renders the FIX Gla domain
## colored by pan-Gla ratio
pymol_gla_setup <- c(
  "reinitialize",
  ## Huang 2004 FIX Gla structure (with 10C12 antibody)
  "fetch 1nl0",
  ## white, transparent background
  "bg_color white",
  "set opaque_background, 0",
  ## publication-style ray tracing
  "set ray_trace_mode, 1",
  "set ray_trace_gain, 0.00000",
  ## keep chain G (FIX Gla domain) and delete everything else
  "select FIX, chain G",
  "remove not FIX",
  ## carboxylated residues: show side chains as sticks
  "select gla, resn cgu",
  "show sticks, gla and not (name c+n)",
  ## cysteines (disulfide bridge): show side chains as sticks
  "show sticks, resn cys and not (name c+n)",
  ## per-position coloring commands, joined into one newline-separated string
  paste0(gla_positions, collapse = "\n"),
  ## side chain colors
  "color red, gla and elem o",
  "color yellow, resn cys and not (name ca+c+n+o)",
  ## drop waters
  "remove resn hoh",
  ## color and resize Ca ions
  ## Pauling 1960 - Ca VI in crystal = 1.14 angstroms
  "color teal, elem ca",
  "alter elem ca, vdw = 1.14",
  "rebuild",
  ## standardized viewpoint
  "set_view (0.111095250, 0.758084536, -0.642623723, 0.989960790, -0.027554817, 0.138635233, 0.087390937, -0.651574373, -0.753535390, 0.000006676, -0.000007272, -108.327758789, 47.139995575, 4.157243252, -5.074407101, 89.470329285, 127.185188293, -20.000000000)",
  ## rotate 180 degrees about the y axis
  "rotate y, 180"
)

## write the script as a pml file, ending with a png command that ray-traces
## the view to Sxa_gla_ratio.png
file_conn <- file(here("outputs", "supp_fig_panels", "Sxa_gla_ratio.pml"))
writeLines(
  c(pymol_gla_setup,
    paste0("png ",
           here("outputs", "supp_fig_panels", "Sxa_gla_ratio.png"),
           ", height = 5.5cm, dpi = 600, ray = 1")),
  con = file_conn
)
close(file_conn)

```

```{r EAHAD}

## load the EAHAD FIX database of missense and nonsense variants (one row per
## case) and tidy the amino acid and activity/antigen fields
eahad <- read_csv(here("inputs", "eahad", "231009_EAHAD_variants_bycase.csv")) %>%
  mutate(
    ## WT residue: characters 4-6 of Protein_change
    wt_aa3 = str_sub(Protein_change, start = 4L, end = 6L),
    ## variant residue: three characters ending one before the last for
    ## missense, "Ter" for nonsense (NA for any other Variant_effect)
    mut_aa3 = case_when(
      Variant_effect == "Missense" ~ str_sub(Protein_change,
                                             start = -4L,
                                             end = -2L),
      Variant_effect == "Nonsense" ~ "Ter"
    ),
    ## repair mis-capitalized histidine
    mut_aa3 = gsub("HIs", "His", mut_aa3),
    ## activity and antigen: "<1" (severe) becomes 0.1 for plotting purposes,
    ## then any value containing "-" becomes NA
    across(c(FIX_activity, FIX_antigen),
           ~ gsub("-", NA, gsub("<1", "0.1", .x)))
  )

###############################################################################

## cases where neither activity nor antigen is reported as a range
## (no "to" in either field); both fields become numeric
eahad_no_range <- eahad %>%
  filter(!(grepl("to", FIX_activity) | grepl("to", FIX_antigen))) %>%
  mutate(across(c(FIX_activity, FIX_antigen), as.numeric))

## split off variants with range values ("a to b") for antigen or activity and
## replace each range with the mean of its two endpoints
## output: same columns as eahad, with numeric FIX_antigen and FIX_activity
## moved to the end; returned as a plain (non-rowwise) data frame
eahad_ranged_values <- eahad %>%
  ## keep only variants with ranges
  filter(grepl("to", FIX_activity) | grepl("to", FIX_antigen)) %>%
  ## split antigen and activity each into 2 columns, one per range endpoint
  ## a field that is not a range has only one piece; fill = "right" sets its
  ## second column to NA, exactly as the default does but without a warning
  separate(FIX_antigen, into = c("antigen1", "antigen2"),
           sep = " to ", extra = "drop", fill = "right", convert = TRUE) %>%
  separate(FIX_activity, into = c("activity1", "activity2"),
           sep = " to ", extra = "drop", fill = "right", convert = TRUE) %>%
  ## calculate rowwise mean
  rowwise() %>%
  ## mean of the available endpoints (NaN when both are missing)
  mutate(FIX_antigen = mean(c(antigen1, antigen2), na.rm = TRUE),
         FIX_activity = mean(c(activity1, activity2), na.rm = TRUE)) %>%
  ## drop the rowwise grouping so it cannot leak into later steps
  ungroup() %>%
  ## remove unnecessary columns
  select(-starts_with("antigen"), -starts_with("activity"))

###############################################################################

## recombine ranged and non-ranged cases, build 1-letter variant names, and
## compute the activity:antigen ratio
eahad_cleaned <- bind_rows(eahad_no_range, eahad_ranged_values) %>%
  ## translate 3-letter codes through the hash_aa3_to_aa1 lookup and assemble
  ## the variant name as <WT><position><variant>
  mutate(
    mut_aa = unlist(mget(mut_aa3, envir = hash_aa3_to_aa1@.xData)),
    wt_aa = unlist(mget(wt_aa3, envir = hash_aa3_to_aa1@.xData)),
    variant = paste0(wt_aa, Position, mut_aa)
  ) %>%
  ## drop cases whose comment reports another causative variant
  filter(!grepl("Also has p.", Comments)) %>%
  ## activity to antigen ratio
  mutate(ratio = FIX_activity / FIX_antigen)

## number of unique EAHAD variants that have at least one of activity,
## antigen, or severity recorded
eahad_num_variants <- eahad_cleaned %>%
  ## drop cases where all three fields are missing
  filter(if_any(c(FIX_activity, FIX_antigen, Severity), ~ !is.na(.x))) %>%
  ## drop unnamed variants and any variant name containing "X"
  filter(!is.na(variant) & !grepl("X", variant)) %>%
  ## one row per unique variant, then count the rows
  distinct(variant) %>%
  nrow()

## case-level table used to build the EAHAD output table
eahad_table <- eahad_cleaned  %>%
  ## drop cases where activity, antigen, and severity are all missing
  filter(if_any(c(FIX_activity, FIX_antigen, Severity), ~ !is.na(.x))) %>%
  ## drop unnamed variants and any variant name containing "X"
  filter(!is.na(variant) & !grepl("X", variant)) %>%
  ## columns reported in the table
  select(variant, Protein_change, Position, Legacy_position,
         FIX_activity, FIX_antigen, Severity) %>%
  ## severity: "-" or NA becomes "Missing", everything else is sentence case
  mutate(Severity = if_else(is.na(Severity) | Severity == "-",
                            "Missing",
                            str_to_sentence(Severity)))

## per-variant severity counts plus a single assigned severity
eahad_table_severity <- eahad_table %>%
  ## number of cases per variant and severity level
  count(variant, Severity) %>% 
  ## one count column per severity level, 0 where a level is absent
  pivot_wider(names_from = Severity,
              values_from = n,
              values_fill = 0) %>%
  ## number of cases with a reported (non-missing) severity
  mutate(n_severity = Mild + Moderate + Severe) %>%
  ## attach the agreed severity per variant
  left_join(select(eahad_severity, variant, Severity), by = "variant") %>%
  ## fill in the assigned severity where no agreed severity was joined
  ## NOTE(review): every counted variant has at least one non-zero count
  ## column, so the first ("Missing") rule looks unreachable - confirm
  mutate(Severity = case_when(
    if_all(where(is.numeric), ~ .x == 0) ~ "Missing",
    if_any(where(is.numeric), ~ .x > 0) & is.na(Severity) ~ "Multiple",
    TRUE ~ Severity
  )) %>%
  rename(`Assigned severity` = Severity)

## get antigens with counts
## output (one row per variant; every summary column is stored as text):
    ## variant
    ## n_antigen - number of cases with an antigen value
    ## min_FIX_antigen, max_FIX_antigen, mean_FIX_antigen
eahad_table_antigen <- eahad_table %>%
  ## remove NA (and NaN, produced upstream when a range had no usable value)
  filter(!is.na(FIX_antigen),
         !is.nan(FIX_antigen)) %>%
  ## count and find min, max, mean
  group_by(variant) %>%
  summarise(n_antigen = n(),
            min_FIX_antigen = min(FIX_antigen),
            max_FIX_antigen = max(FIX_antigen),
            mean_FIX_antigen = mean(FIX_antigen)) %>%
  ungroup() %>%
  ## sub <1 for 0.1 (reversing what we did for analysis)
  ## near() instead of == so that a mean of several 0.1 values is still
  ## recognized if floating point arithmetic leaves it a hair off 0.1
  mutate(across(where(is.numeric), ~case_when(near(., 0.1) ~ "<1",
                                              TRUE ~ as.character(round(., digits = 1)))))

## get activities with counts
## output (one row per variant; every summary column is stored as text):
    ## variant
    ## n_activity - number of cases with an activity value
    ## min_FIX_activity, max_FIX_activity, mean_FIX_activity
eahad_table_activity <- eahad_table %>%
  ## remove NA (and NaN, produced upstream when a range had no usable value)
  filter(!is.na(FIX_activity),
         !is.nan(FIX_activity)) %>%
  ## count and find min, max, mean
  group_by(variant) %>%
  summarise(n_activity = n(),
            min_FIX_activity = min(FIX_activity),
            max_FIX_activity = max(FIX_activity),
            mean_FIX_activity = mean(FIX_activity)) %>%
  ungroup() %>%
  ## sub <1 for 0.1 (reversing what we did for analysis)
  ## near() instead of == so that a mean of several 0.1 values is still
  ## recognized if floating point arithmetic leaves it a hair off 0.1
  mutate(across(where(is.numeric), ~case_when(near(., 0.1) ~ "<1",
                                              TRUE ~ as.character(round(., digits = 1)))))

## assemble the final per-variant table: one row per unique variant with its
## severity, antigen, and activity summaries
eahad_table_final <- eahad_table %>% 
  ## unique variant names only
  distinct(variant) %>%
  ## attach each summary table by variant
  left_join(eahad_table_severity, by = "variant") %>%
  left_join(eahad_table_antigen, by = "variant") %>%
  left_join(eahad_table_activity, by = "variant")


## Table S9
write_csv(x = eahad_table_final,
          file = here("outputs", "tables", "S9_EAHAD_variants_case_data.csv"))

```

```{r EAHAD antigen}

## per-variant mean and standard error of FIX antigen across cases
eahad_antigen <- eahad_cleaned %>%
  filter(
    ## cases with an antigen value
    !is.na(FIX_antigen),
    ## remove C396S, as variant is a somatic chimera (Taylor, et al, 1991)
    variant != "C396S",
    ## positions outside the secretion antibody epitopes
    !(Position %in% c(heavy_chain_epitopes, light_chain_epitopes))
  ) %>%
  ## number of cases, mean, and SE of antigen per variant
  group_by(variant) %>%
  summarise(n_pat = n(),
            FIX_antigen_mean = mean(FIX_antigen),
            FIX_antigen_se = sd(FIX_antigen) / sqrt(n_pat)) %>%
  ungroup() %>%
  ## a single case has no SE (NA); report it as 0
  replace_na(list(FIX_antigen_se = 0))

###############################################################################

## Fig. 5a - mean FIX antigen vs. light chain secretion score for variants
## seen in more than one patient
eahad_antigen_plot <- eahad_antigen %>%
  ## attach secretion scores
  inner_join(scored_variants_average, by = "variant") %>%
  ## scored variants, light chain antibody (124) only, at least 2 patients
  filter(!is.na(average_score2),
         antibody == "124",
         n_pat > 1) %>%
  ggplot(aes(x = average_score, y = FIX_antigen_mean)) +
  ## +/- 1 SE bars: vertical for antigen, horizontal for score
  geom_errorbar(aes(ymin = FIX_antigen_mean - FIX_antigen_se,
                    ymax = FIX_antigen_mean + FIX_antigen_se),
                color = "grey40") +
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 color = "grey40") +
  ## one point per variant
  geom_point(color = "black", size = 1) +
  ## correlation coefficient
  stat_cor(aes(label = after_stat(r.label)),
           size = 6 / .pt) +
  ## vertical dashed line at the lower synonymous threshold for antibody 124
  geom_vline(data = filter(syn_threshold, antibody == "124"),
             aes(xintercept = lower),
             linetype = "dashed") +
  ## horizontal dashed line at 40% antigen, the threshold for normal
  ## variation in FIX antigen levels
  geom_hline(aes(yintercept = 40), linetype = "dashed") +
  ## axis ranges and ticks
  scale_x_continuous(limits = c(-0.02, 1.12),
                     breaks = seq(0, 1, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-15, 155),
                     breaks = seq(0, 150, by = 50),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Mean FIX antigen (%)",
       x = "Light chain secretion score")

## save Fig. 5a as a 40 x 40 mm PDF
ggsave(
  filename = here("outputs", "main_fig_panels", "5a_antigen_vs_secretion.pdf"),
  plot = eahad_antigen_plot,
  device = cairo_pdf,
  width = 40,
  height = 40,
  units = "mm"
)

###############################################################################

## Fig. S10a - mean FIX antigen vs. light chain secretion score, excluding
## the epitope-adjacent positions listed in below_changepoint_lc
eahad_antigen_plot_lc_epi <- eahad_antigen %>%
  ## attach secretion scores
  inner_join(scored_variants_average, by = "variant") %>%
  ## scored variants, light chain antibody (124) only, outside the
  ## epitope-adjacent positions, at least 2 patients
  filter(!is.na(average_score2),
         antibody == "124",
         !(position %in% below_changepoint_lc),
         n_pat > 1) %>%
  ggplot(aes(x = average_score, y = FIX_antigen_mean)) +
  ## +/- 1 SE bars without end caps: vertical for antigen, horizontal for score
  geom_errorbar(aes(ymin = FIX_antigen_mean - FIX_antigen_se,
                    ymax = FIX_antigen_mean + FIX_antigen_se),
                width = 0, linewidth = 0.5, color = "grey40") +
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 height = 0, linewidth = 0.5, color = "grey40") +
  ## one point per variant
  geom_point(size = 1) +
  ## correlation coefficient
  stat_cor(aes(label = after_stat(r.label)),
           size = 6 / .pt) +
  ## vertical dashed line at the lower synonymous threshold for antibody 124
  geom_vline(data = filter(syn_threshold, antibody == "124"),
             aes(xintercept = lower),
             linetype = "dashed") +
  ## horizontal dashed line at 40% antigen, the threshold for normal
  ## variation in FIX antigen levels
  geom_hline(aes(yintercept = 40), linetype = "dashed") +
  ## axis ranges and ticks
  scale_x_continuous(limits = c(-0.02, 1.12),
                     breaks = seq(0, 1, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-15, 155),
                     breaks = seq(0, 150, by = 50),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Mean FIX antigen (%)",
       x = "Light chain secretion score")

## save Fig. S10a (epitope-adjacent positions removed) as a 40 x 40 mm PDF
ggsave(
  filename = here("outputs", "supp_fig_panels",
                  "S10a_antigen_vs_secretion_lc_epi.pdf"),
  plot = eahad_antigen_plot_lc_epi,
  device = cairo_pdf,
  width = 40,
  height = 40,
  units = "mm"
)

###############################################################################

## Fig. S11a - mean FIX antigen vs. light chain secretion score for variants
## that substitute a cysteine (no minimum patient count is applied here)
eahad_antigen_plot_cys <- eahad_antigen %>%
  ## attach secretion scores
  inner_join(scored_variants_average, by = "variant") %>%
  ## scored variants, light chain antibody (124) only, variant residue C
  filter(!is.na(average_score2),
         antibody == "124",
         var_aa == "C") %>%
  ggplot(aes(x = average_score, y = FIX_antigen_mean)) +
  ## +/- 1 SE bars without end caps: vertical for antigen, horizontal for score
  geom_errorbar(aes(ymin = FIX_antigen_mean - FIX_antigen_se,
                    ymax = FIX_antigen_mean + FIX_antigen_se),
                width = 0, linewidth = 0.5, color = "grey40") +
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 height = 0, linewidth = 0.5, color = "grey40") +
  ## one point per variant
  geom_point(size = 1) +
  ## correlation coefficient
  stat_cor(aes(label = after_stat(r.label)),
           size = 6 / .pt) +
  ## vertical dashed line at the lower synonymous threshold for antibody 124
  geom_vline(data = filter(syn_threshold, antibody == "124"),
             aes(xintercept = lower),
             linetype = "dashed") +
  ## horizontal dashed line at 40% antigen, the threshold for normal
  ## variation in FIX antigen levels
  geom_hline(aes(yintercept = 40), linetype = "dashed") +
  ## axis ranges and ticks
  scale_x_continuous(limits = c(-0.02, 1.12),
                     breaks = seq(0, 1, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(-15, 155),
                     breaks = seq(0, 150, by = 50),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## axis titles
  labs(y = "Mean FIX antigen (%)",
       x = "Light chain secretion score")

## save Fig. S11a (cysteine variants) as a 40 x 40 mm PDF
ggsave(
  filename = here("outputs", "supp_fig_panels",
                  "S11a_antigen_vs_secretion_cys.pdf"),
  plot = eahad_antigen_plot_cys,
  device = cairo_pdf,
  width = 40,
  height = 40,
  units = "mm"
)

```

```{r classify and identify discordant EAHAD antigen/secretion variants}

## Classify antigen and secretion levels for EAHAD variants
## Antigen is split at 40%; secretion is split at the light chain
## synonymous threshold stored in syn_threshold_wide$lower_124.
eahad_antigen_class <- inner_join(eahad_antigen, scored_variants_average,
                                  by = "variant") %>%
  ## scored variants only, light chain antibody, at least two patients
  filter(!is.na(average_score2),
         antibody == "124",
         n_pat > 1) %>%
  ## binary labels; a missing value on either input stays NA
  mutate(antigen_class = if_else(FIX_antigen_mean < 40,
                                 "low", "WT-like"),
         secretion_class = if_else(average_score <
                                     pull(syn_threshold_wide, lower_124),
                                   "low", "WT-like"))
  

## Cross-tabulation of secretion class (rows) against antigen class (columns),
## shown as row percentages with the raw counts appended
eahad_antigen_table <- tabyl(eahad_antigen_class,
                             secretion_class, antigen_class) %>%
  adorn_totals(where = c("col", "row")) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 1) %>%
  adorn_ns()

## Table S2 - discordant secretion-antigen variants
## Keeps variants whose secretion class and antigen class disagree, rounds the
## numeric columns, and prettifies the column names for the final table.
eahad_discordant <- eahad_antigen_class %>% 
  ## filter only variants with discordant labels
  ## (at least one label that is not "low" AND at least one that is not "WT")
  filter(if_any(c(secretion_class, antigen_class), ~!grepl("low", .x))) %>%
  filter(if_any(c(secretion_class, antigen_class), ~!grepl("WT", .x))) %>%
  ## arrange nicely
  arrange(secretion_class, position) %>%
  ## keep only necessary columns
  select(variant, antigen_class, secretion_class, FIX_antigen_mean, FIX_antigen_se,
         average_score, se_score) %>%
  ## round values and correct 0.1 back to < 1%
  ## Only a value equal to 0.1 is relabelled. A regex substitution on 0.1
  ## would treat "." as a wildcard and rewrite any value containing that
  ## pattern (e.g. 20.1 -> "2< 1").
  mutate(across(where(is.numeric), ~round(.x, digits = 2)),
         FIX_antigen_mean = if_else(near(FIX_antigen_mean, 0.1),
                                    "< 1",
                                    as.character(FIX_antigen_mean))) %>%
  ## rename columns for final table purposes
  rename(score_se = se_score,
         score_mean = average_score) %>%
  rename_with(~gsub("_", " ", .x)) %>%
  rename_with(~gsub("FIX ", "", .x)) %>%
  rename_with(~gsub("score", "secretion score", .x)) %>%
  rename_with(~str_to_sentence(.x)) %>%
  rename_with(~gsub(" se$", " SE", .x))

## Save Table S2 - Antigen-secretion score discordant variants from EAHAD
write_csv(eahad_discordant, here("outputs", "tables", "S2_EAHAD_discordant.csv"))

```

```{r eahad severity}

## EAHAD variants with a single, unambiguous modal disease severity
eahad_severity <- eahad_cleaned %>%
  ## drop missing / placeholder severity calls and light chain epitope positions
  filter(!is.na(Severity),
         Severity != "-",
         !Position %in% light_chain_epitopes) %>%
  ## number of patients per variant and severity
  group_by(variant, Severity) %>%
  summarise(n_patients = n(), .groups = "drop") %>%
  ## most frequent severity per variant (ties are all kept here ...)
  group_by(variant) %>%
  slice_max(order_by = n_patients, n = 1, with_ties = TRUE) %>%
  add_count() %>%
  ungroup() %>%
  ## ... and tied variants are then discarded
  filter(n == 1)

## severity calls joined to MultiSTEP scores, scored missense variants only
eahad_severity_scores <- inner_join(eahad_severity, scored_variants_average,
                                    by = "variant") %>%
  filter(!is.na(average_score2),
         wt_aa != var_aa,
         var_aa != "X")

```

```{r severity vs secretion vs antigen}

## Fig. 5b - severity vs. secretion
## Light chain secretion score per disease severity, with pairwise Dunn tests
severity_plot <- filter(eahad_severity_scores, antibody == "124") %>%
  ggplot(aes(x = Severity, y = average_score)) +
  ## individual variants
  geom_jitter(width = 0.2, size = 1.2, stroke = 0,
              alpha = 0.2, color = "steelblue") +
  ## distribution outline and summary box
  geom_violin(scale = "width", fill = NA,
              color = "black", linewidth = 0.3) +
  geom_boxplot(width = 0.1, fill = NA, outliers = FALSE,
               color = "black", linewidth = 0.3) +
  ## synonymous threshold for the light chain antibody
  geom_hline(data = filter(syn_threshold, antibody == "124"),
             aes(yintercept = lower),
             linetype = "dashed", linewidth = 0.3) +
  ## pairwise comparisons, Bonferroni-adjusted
  stat_pwc(method = "dunn_test",
           p.adjust.method = "bonferroni",
           label = "{p.adj.signif}",
           label.size = 6 / .pt,
           y.position = 1.12,
           step.increase = 0.1,
           tip.length = 0.01) +
  ## y axis
  scale_y_continuous(limits = c(-0.02, 1.52),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  labs(x = "Disease severity",
       y = "Light chain secretion score")

## save Fig. 5b
ggsave(here("outputs", "main_fig_panels", "5b_severity_vs_secretion.pdf"),
       plot = severity_plot, device = cairo_pdf,
       width = 40, height = 40, units = "mm")

###############################################################################

## Fig. S10b - severity vs. secretion - no epitope adjacent positions
## Same layout as Fig. 5b after dropping positions in below_changepoint_lc
severity_plot_noepi <- eahad_severity_scores %>%
  filter(antibody == "124",
         !position %in% below_changepoint_lc) %>%
  ggplot(aes(x = Severity, y = average_score)) +
  ## individual variants
  geom_jitter(width = 0.2, size = 1.2, stroke = 0,
              alpha = 0.2, color = "steelblue") +
  ## distribution outline and summary box
  geom_violin(scale = "width", fill = NA,
              color = "black", linewidth = 0.3) +
  geom_boxplot(width = 0.1, fill = NA, outliers = FALSE,
               color = "black", linewidth = 0.3) +
  ## synonymous threshold for the light chain antibody
  geom_hline(data = filter(syn_threshold, antibody == "124"),
             aes(yintercept = lower),
             linetype = "dashed", linewidth = 0.3) +
  ## pairwise comparisons, Bonferroni-adjusted
  stat_pwc(method = "dunn_test",
           p.adjust.method = "bonferroni",
           label = "{p.adj.signif}",
           label.size = 6 / .pt,
           y.position = 1.12,
           step.increase = 0.1,
           tip.length = 0.01) +
  ## y axis
  scale_y_continuous(limits = c(-0.02, 1.52),
                     breaks = seq(0, 1.2, by = 0.2),
                     labels = as.character,
                     expand = c(0, 0)) +
  labs(x = "Disease severity",
       y = "Light chain secretion score")

## save Fig. S10b
ggsave(here("outputs", "supp_fig_panels", "S10b_severity_vs_secretion_lc_epi.pdf"),
       plot = severity_plot_noepi, device = cairo_pdf,
       width = 40, height = 40, units = "mm")

###############################################################################

## Fig. S11b - number of gain-of-cysteine variants per disease severity
severity_plot_cys <- eahad_severity_scores %>%
  ## light chain antibody rows for gain-of-cysteine variants
  filter(antibody == "124",
         var_aa == "C") %>%
  ggplot(aes(x = Severity)) +
  ## one bar per severity (row counts)
  geom_bar(position = "dodge") +
  ## clean up axes
  scale_y_continuous(limits = c(-0.05, 12.1),
                     breaks = seq(0, 12, by = 3),
                     labels = as.character,
                     expand = c(0, 0)) +
  labs(x = "Severity",
       y = "Number of gain-of-cysteine variants")

## save Fig. S11b - severity for gain of cys variants
ggsave(here("outputs", "supp_fig_panels", "S11b_severity_vs_cys.pdf"),
       plot = severity_plot_cys, device = cairo_pdf,
       width = 40, height = 40, units = "mm")

## calculate proportion of cysteines in each class
## eahad_severity_scores holds one row per variant per antibody, so the table
## is restricted to the light chain antibody ("124", as in Fig. S11b) to count
## each variant once. Without this filter every variant is counted once per
## scored antibody, which inflates the counts fed to Fisher's test.
cys_severity_table <- eahad_severity_scores %>%
  ## one row per variant
  filter(antibody == "124") %>%
  mutate(is_cys = case_when(var_aa == "C" ~ "gain-of-cysteine",
                            var_aa != "C" ~ "gain-of-non-cysteine"),
         sev_new = case_when(Severity == "Moderate" | Severity == "Severe" ~ "Moderate/Severe",
                             Severity == "Mild" ~ "Mild")) %>% 
  tabyl(is_cys, sev_new)

## Fisher's test (gain-of-cysteine vs. other missense, by severity group)
fisher_cysteine_sev <- fisher.test(cys_severity_table)

## fraction of variants in each severity with light chain secretion
## score <= 0.05 ("low") vs. > 0.05 ("not low")
sev_table_by_score <- eahad_severity_scores %>% 
  ## light chain secretion score only
  filter(antibody == "124") %>%
  ## flag very low scores
  mutate(is_lower = if_else(average_score > 0.05, "not low", "low")) %>%
  ## row percentages with counts
  tabyl(Severity, is_lower) %>%
  adorn_totals(where = "col") %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## same table after removing positions in below_changepoint_lc
## (epitope-adjacent positions)
sev_table_by_score_noepi <- eahad_severity_scores %>% 
  ## light chain secretion score only, no epitope-adjacent positions
  filter(antibody == "124",
         !position %in% below_changepoint_lc) %>%
  ## flag very low scores
  mutate(is_lower = if_else(average_score > 0.05, "not low", "low")) %>%
  ## row percentages with counts
  tabyl(Severity, is_lower) %>%
  adorn_totals(where = "col") %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

###############################################################################

## fraction of variants in each severity with "low" vs. "WT-like" antigen class
## NOTE(review): the original comment described this as WT-like vs. LOF
## *secretion*, but the table is built on antigen_class - confirm which
## classification is intended.
sev_table_wt_lof <- eahad_antigen_class %>% 
  ## light chain secretion score rows
  filter(antibody == "124") %>%
  ## attach the modal severity per variant
  left_join(select(eahad_severity, variant, Severity),
            by = "variant") %>%
  ## variants without a severity call are dropped
  filter(!is.na(Severity)) %>%
  ## row percentages with counts
  tabyl(Severity, antigen_class) %>%
  adorn_totals(where = "col") %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

###############################################################################

## Fig. 5d - severity vs. antigen
## Mean FIX antigen per disease severity, with pairwise Dunn tests
severity_antigen <- inner_join(eahad_antigen, eahad_severity_scores,
                               by = "variant") %>%
  ## scores are repeated per antibody; keep a single row per variant
  filter(antibody == "124") %>%
  ggplot(aes(x = Severity, y = FIX_antigen_mean)) +
  ## individual variants
  geom_jitter(width = 0.2, size = 1.2, stroke = 0,
              alpha = 0.3, color = "steelblue") +
  ## distribution outline and summary box
  geom_violin(scale = "width", fill = NA,
              color = "black", linewidth = 0.3) +
  geom_boxplot(width = 0.1, fill = NA, outliers = FALSE,
               color = "black", linewidth = 0.3) +
  ## 40% antigen: threshold for normal variation in FIX antigen levels
  geom_hline(yintercept = 40, linetype = "dashed", linewidth = 0.3) +
  ## pairwise comparisons, Bonferroni-adjusted
  stat_pwc(method = "dunn_test",
           p.adjust.method = "bonferroni",
           label = "{p.adj.signif}",
           label.size = 6 / .pt,
           y.position = 150,
           step.increase = 0.1,
           tip.length = 0.01) +
  ## y axis
  scale_y_continuous(limits = c(-1, 205),
                     breaks = seq(0, 150, by = 50),
                     labels = as.character,
                     expand = c(0, 0)) +
  labs(x = "Disease severity",
       y = "FIX antigen (%)")

## save Fig. 5d
ggsave(here("outputs", "main_fig_panels", "5d_severity_vs_antigen.pdf"),
       plot = severity_antigen, device = cairo_pdf,
       width = 40, height = 40, units = "mm")
```

```{r WT-like secretion severe variants}

## variants whose modal severity is "Severe" (all antibodies)
severe_vars <- filter(eahad_severity_scores, Severity == "Severe")

## number of severe variants, counted on the light chain antibody rows
severe_vars_count <- nrow(filter(severe_vars, antibody == "124"))

## severe variants that still score at or above the synonymous threshold
wt_like_severe <- left_join(severe_vars, syn_threshold, by = "antibody") %>%
  ## light chain ("124") and FIX-specific carboxylation ("001") scores only,
  ## at or above the per-antibody threshold rounded to 2 digits
  filter(antibody %in% c("124", "001"),
         average_score2 >= round(lower, digits = 2)) %>%
  ## re-derive wt_aa / position / var_aa from the variant string
  extract(col = variant,
          into = c("wt_aa", "position", "var_aa"),
          regex = "([A-Z])([0-9]{1,3})([A-Z])",
          remove = FALSE) %>% 
  ## extract() returns text; position must be numeric for the domain join
  mutate(position = as.numeric(position)) %>% 
  ## annotate with protein domain
  left_join(domains, by = "position")

## one row per variant, one score column per antibody, for easy inspection
wt_like_severe_wide <- wt_like_severe %>%
  select(variant, position, domain, antibody, average_score2) %>%
  pivot_wider(names_from = antibody,
              values_from = average_score2,
              names_prefix = "ab") %>%
  arrange(position)

## severe variants with WT-like scores, stacked by domain, one bar per score type
wt_like_severe_plot <- wt_like_severe %>%
  ## readable score labels ("001" = carboxylation, anything else = secretion)
  mutate(antibody_label = if_else(antibody %in% "001",
                                  "FIX-specific\ncarboxylation\nscore",
                                  "Secretion\nscore")) %>%
  ggplot(aes(y = antibody_label,
             fill = fct_rev(domain))) +
  ## stacked horizontal bars
  geom_bar(position = "stack") +
  ## axis and palette (palette reversed to match the reversed domain factor)
  scale_x_continuous(limits = c(0, 81),
                     breaks = seq(0, 80, by = 20),
                     labels = as.character,
                     expand = c(0, 0)) +
  scale_fill_manual(values = rev(paletteer_d("PNWColors::Sunset"))) +
  labs(x = "Variants with\nWT-like scores") +
  ## legend listed in original domain order
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.background = element_rect(fill = NA),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(0, 5, 0, -5))

## save Fig. 5c - WT-like severe variants by domain
ggsave(here("outputs", "main_fig_panels", "5c_WT_like_severe.pdf"),
       plot = wt_like_severe_plot, device = cairo_pdf,
       width = 60, height = 42, units = "mm")

```

```{r elisa}

## import ELISA data from secreted FIX constructs, compare to secretion
elisa <- read_csv(here("inputs", "elisa", "secreted_elisas_vitK.csv")) %>%
  ## keep 48h timepoint
  ## remove NA concentrations (variants with values outside the standard curve)
  filter(timepoint == "48h",
         !is.na(concentration))

## t test for ELISA C28Y vs. no GOI (neg control)
## Two-sample (Welch) test between the two groups. Calling t.test() on the
## pooled concentrations would instead be a one-sample test against 0 and
## would not compare C28Y with the negative control.
elisa_t <- elisa %>%
  ## filter only negative control and C28Y
  filter(variant %in% c("C28Y", "noGOI")) %>%
  ## result is kept as a tidy data frame in the `tt` column
  summarise(tt = t.test(concentration[variant == "C28Y"],
                        concentration[variant == "noGOI"]) %>%
              tidy())

## summarized elisa with secretion scores
## One row per ELISA variant: n, mean and SE of the concentration, joined to
## the heavy chain ("102") secretion scores.
elisa_sum <- elisa %>%
  ## group by variant and calculate summary statistics
  group_by(variant) %>%
  summarise(n = n(),
            mean_elisa = mean(concentration, na.rm = TRUE),
            se_elisa = sd(concentration) / sqrt(n)) %>%
  ## add secretion scores (heavy chain)
  left_join(scored_variants_average %>%
              filter(antibody == "102"), by = "variant") %>%
  ## convert noGOI scores to zero (definition of assay)
  ## and WT to 1
  ## Standard-error columns (se_*) are excluded from the overwrite; otherwise
  ## WT would be assigned a standard error of 1 and noGOI one of 0.
  mutate(across(contains("score") & !starts_with("se_"),
                ~case_when(variant == "WT" ~ 1,
                           variant == "noGOI" ~ 0,
                           TRUE ~ .x)),
         ## label C28Y
         label = case_when(variant == "C28Y" ~ "C28Y",
                           TRUE ~ NA_character_))
  
## ELISA vs. heavy chain secretion score
## Mean secreted FIX (ELISA, % WT) against the secretion score, with SE error
## bars on both axes and C28Y highlighted.
elisa_plot <- elisa_sum %>%
  ## plot
  ggplot(aes(x = average_score,
             y = mean_elisa)) +
  ## diagonal line (100% secreted FIX at a score of 1)
  geom_abline(linetype = "dashed", color = "grey20", slope = 100, intercept = 0) +
  ## errorbars
  geom_errorbar(aes(ymin = mean_elisa - se_elisa,
                    ymax = mean_elisa + se_elisa),
                color = "grey40") +
  ## horizontal bars are centred on the plotted x value (average_score) so
  ## they line up with the points
  geom_errorbarh(aes(xmin = average_score - se_score,
                     xmax = average_score + se_score),
                 color = "grey40") +
  ## mean point
  geom_point(aes(color = label), size = 0.7, show.legend = FALSE) +
  ## correlation statistic
  stat_cor(aes(label = after_stat(r.label)),
           size = 6 / .pt) +
  ## label C28Y (all other labels are NA)
  geom_text_repel(aes(label = label),
                  size = 6 / .pt,
                  nudge_y = 20,
                  nudge_x = -0.075,
                  min.segment.length = 0) +
  ## add axis titles
  labs(x = "Heavy chain secretion score",
       y = "Mean secreted FIX (% WT)") +
  scale_x_continuous(expand = c(0, 0),
                     limits = c(-0.05, 1.3),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = function(x) as.character(x)) + 
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-5, 130),
                     breaks = seq(0, 125, by = 25),
                     labels = function(x) as.character(x)) +
  ## labelled point in orange, unlabelled (NA) points in black
  scale_color_manual(values = c("orange"), na.value = "black")

## save Fig. S2b - ELISA vs heavy chain
ggsave(here("outputs", "supp_fig_panels", "S2b_elisa.pdf"),
       plot = elisa_plot, device = cairo_pdf,
       height = 40, width = 40, units = "mm")

```

```{r clinical variants}

## clinical variants from MLOF, ClinVar, and gnomAD
## Long format: one row per curated variant per antibody score.
curated_clinical_variants <- read_csv(here("inputs", "ClinVar",
                                           "clinical_training_variants.csv")) %>%
  ## extract variant into wt_aa, position, and var_aa
  extract(F9_variant_hgvs, into = c("wt_aa", "position", "var_aa"),
          regex = "([A-Z])([0-9]+)([A-Z])", remove = FALSE) %>%
  ## rename variant column
  rename(variant = F9_variant_hgvs) %>%
  ## collapse B/LB and P/LP
  ## [Bb] / [Pp] match either capitalisation; a "/" inside the brackets would
  ## be treated as one more literal character to match
  mutate(pathogenicity = case_when(grepl("[Bb]enign", Agreed_class) ~
                                     "Benign",
                                   grepl("[Pp]athogenic", Agreed_class) ~
                                     "Pathogenic"),
         ## make position numeric
         position = as.numeric(position)) %>%
  ## join with scored data
  left_join(scored_variants_average %>%
              select(antibody, variant, average_score),
            by = "variant") %>%
  ## remove Strep II tag antibody scores
  ## (rows with no matching score have antibody = NA and are dropped here too)
  filter(antibody != "strep")

###############################################################################

## Fig. 5d - per antibody plot for separation of benign and pathogenic
## One histogram panel per antibody label, combined side by side with a
## shared legend.
curated_plot_by_ab <- curated_clinical_variants %>%
  ## create labels 
  left_join(antibody_table %>%
              filter(antibody != "strep"),
            by = "antibody") %>%
  left_join(syn_threshold, by = "antibody") %>%
  mutate(pathogenicity = case_when(pathogenicity == "Pathogenic" ~ "Pathogenic (P/LP)",
                                   pathogenicity == "Benign" ~ "Benign (B/LB)")) %>% 
  ## nest by labeling variable
  group_by(antibody_label2) %>% 
  nest() %>%
  ## create plots with map2 function
  ## (.x = data for one antibody, .y = its label)
  mutate(plot = map2(data, antibody_label2,
                     ~ggplot(data = .x,
                             aes(x = average_score,
                                 fill = pathogenicity)) +
                       ## histogram of scores
                       geom_histogram(bins = 20, color = "black") +
                       ## synonymous threshold, passed as a constant so a
                       ## single line is drawn per panel
                       geom_vline(xintercept = unique(.x$lower),
                                  linetype = "dashed") +
                       ## adjust x and y axes
                       scale_x_continuous(expand = c(0, 0),
                                          limits = c(-0.1, 1.55),
                                          breaks = seq(0, 1.5, by = 0.5),
                                          labels = function(x) as.character(x)) +
                       scale_y_continuous(expand = c(0, 0),
                                          limits = c(-0.5, 40.5),
                                          breaks = seq(0, 40, by = 10),
                                          labels = function(x) as.character(x)) +
                       ## adjust fill colors 
                       ## named values keep the class-to-colour mapping fixed
                       ## even if a panel contains only one class
                       scale_fill_manual(values = c("Benign (B/LB)" = "#2166ac",
                                                    "Pathogenic (P/LP)" = "#b2182b")) +
                       ## add titles
                       labs(x = "Functional score",
                            y = "Number of variants",
                            title = .y))) %>%
  ## extract only plots into list
  pull(plot) %>%
  ## arrange list of plots
  wrap_plots(guides = "collect", nrow = 1) &
  theme(legend.key.size = unit(3, "mm"))

## save fig. 5d
ggsave(here("outputs", "main_fig_panels", "5d_clinical_vars_by_antibody.pdf"),
       plot = curated_plot_by_ab, device = cairo_pdf,
       height = 45, width = 197, units = "mm")

###############################################################################

## curated variants in wide format: one row per variant, one "ab<antibody>"
## score column per antibody
curated_wide <- pivot_wider(curated_clinical_variants,
                            names_from = antibody,
                            values_from = average_score,
                            names_prefix = "ab") %>%
  ## keep only variants with a value in every column whose name contains "ab"
  filter(complete.cases(across(contains("ab")))) %>%
  ## drop columns not carried forward
  select(-c(wt_aa, position, var_aa, Agreed_class))

```

```{r Random Forest Model for variant function}
  
## Train a random forest classifier of pathogenicity from curated_wide:
## split -> recipe -> hyperparameter tuning by cross-validation -> final fit.
## The calls below draw random numbers in sequence after set.seed(), so their
## order must not change if the results are to be reproduced.

## reset seed to make sure it's reproducible every time
set.seed(627)

## split data
## (stratified by pathogenicity; split proportion is left at the function's
## default)
clinical_split <- initial_split(curated_wide,
                                strata = pathogenicity)

## split into variables for easier calling
clinical_training <- training(clinical_split)
clinical_test <- testing(clinical_split)

## create pre-processing recipe for doing the modeling
## every column other than pathogenicity starts as a predictor
clinical_recipe <- recipe(pathogenicity ~ .,
                          data = clinical_training) %>%
  ## update variants to ID, since we'll join on that later
  ## but don't want to use it as a predictor
  update_role(variant, new_role = "ID") %>%
  ## NOTE(review): step_rose presumably rebalances the outcome classes
  ## (ROSE sampling) - confirm against the package providing it
  step_rose(pathogenicity, seed = 627)

## build model to tune hyperparameters
## mtry and min_n are tuned; the number of trees is fixed at 100
rf_tune <- rand_forest(mtry = tune(),
                       min_n = tune(),
                       trees = 100) %>%
 set_mode("classification") %>%
 set_engine("ranger")

## combine into workflow
tune_workflow <- workflow() %>%
  add_recipe(clinical_recipe) %>%
  add_model(rf_tune)

## split training into folds
## 5-fold cross-validation, stratified by pathogenicity
clinical_folds <- vfold_cv(clinical_training, v = 5,
                            strata = pathogenicity)


## create a grid of values to try
## mtry from 1 to 4 and min_n from 1 to 40, up to 40 levels per parameter
rf_grid <- grid_regular(mtry(range = c(1, 4)),
                        min_n(range = c(1, 40)),
                        levels = 40)

## tune hyperparameters
## every grid combination is evaluated on the cross-validation folds
tuned_clinical <- tune_grid(tune_workflow,
                            resamples = clinical_folds,
                            grid = rf_grid)

## capture best performing hyperparameters
## (selected on ROC AUC)
best_auc <- select_best(tuned_clinical, metric = "roc_auc")

## finalize model
## substitute the selected hyperparameters into the model specification
final_rf_model <- finalize_model(rf_tune, best_auc)

## final workflow
final_wf <- workflow() %>%
  add_recipe(clinical_recipe) %>%
  add_model(final_rf_model)

## last fit on data
## last_fit() is given the original train/test split object
final_res <- final_wf %>%
  last_fit(clinical_split)

###############################################################################

## metric sets: class-based metrics and probability-based metrics
collect_performance1 <- metric_set(sens, spec, npv, ppv)
collect_performance2 <- metric_set(roc_auc, pr_auc)

## predictions collected from the final fit
output_res <- collect_predictions(final_res)

## performance values; the second outcome level is treated as the event
output_performance <- rbind(
  collect_performance1(output_res,
                       truth = pathogenicity,
                       estimate = .pred_class,
                       event_level = "second"),
  collect_performance2(output_res,
                       truth = pathogenicity,
                       .pred_Pathogenic,
                       event_level = "second")
)

## individual metrics used for annotation of the ROC plot
output_sens <- output_performance$.estimate[output_performance$.metric == "sens"]

output_spec <- output_performance$.estimate[output_performance$.metric == "spec"]

output_roc <- output_performance$.estimate[output_performance$.metric == "roc_auc"]

###############################################################################

## Fig. 5f - ROC curve
## Curve from the collected predictions, with the classifier's operating point
## (1 - specificity, sensitivity) and the AUC annotated
roc_plot <- roc_curve(output_res,
                      truth = pathogenicity, .pred_Pathogenic,
                      event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  ## reference diagonal
  geom_abline(lty = 3) +
  ## operating point of the class predictions
  annotate(geom = "point",
           x = 1 - output_spec,
           y = output_sens) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.25),
                     labels = as.character) + 
  scale_y_continuous(breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## AUC label
  annotate(geom = "text", x = 0, y = 0.95, hjust = 0,
           size = 6 / .pt,
           label = paste0("AUC: ", round(output_roc, digits = 3))) +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  theme(axis.text = element_text(size = 6),
        axis.title = element_text(size = 6))

## save Fig. 5f - ROC curve
ggsave(here("outputs", "main_fig_panels", "5f_ROC-AUC.pdf"),
       plot = roc_plot, device = cairo_pdf,
       width = 45, height = 45, units = "mm")

```

```{r predict all variants with random forest model}

## scores for all variants, one row per variant and one "ab<antibody>" column
## per antibody, matching the layout used to train the model
new_data_for_pred <- scored_variants_average %>%
  ## variant label used for merging
  mutate(variant = paste0(wt_aa, position, var_aa)) %>%
  ## identifiers plus the score to spread
  select(wt_aa, position, var_aa, variant, antibody, average_score) %>%
  pivot_wider(names_from = antibody,
              values_from = average_score,
              names_prefix = "ab") %>%
  ## keep rows with no missing value in any column
  filter(if_all(everything(), ~!is.na(.x)))

###############################################################################

## fitted workflow from the final fit, reused for new variants
new_workflow <- extract_workflow(final_res)

## predictions for all variants, with a functional label per predicted class
new_predictions <- new_workflow %>%
  augment(new_data_for_pred) %>%
  mutate(label = case_when(.pred_class == "Benign" ~ "WT-like",
                           .pred_class == "Pathogenic" ~ "Loss of function"))

## Table S3 - predictions for all variants
## Score columns are renamed to "<antibody_label> score" by a round trip
## through long format and antibody_table.
variant_model_results <- new_predictions %>%
  ## fill in missing positions and possible variants, keep other columns NA
  # complete(position = seq(1, nchar(wt_FIX), by = 1), var_aa,
  #          fill = list(NA)) %>%
  # ## replace NA values for wt_aa and diff_aa for missing variants since 
  # ## complete fills in NA for missing variables
  # mutate(wt_aa = wt_FIX_aa$wt_aa[position],
  #        variant = paste0(wt_aa, position, var_aa),
  #        var_aa = as.character(var_aa)) %>%
  ## drop nonsense variants
  filter(var_aa != "X") %>%
  ## long format: one row per variant and antibody score column
  pivot_longer(cols = starts_with("ab"),
               names_to = "antibody_nonnum",
               values_to = "value") %>%
  ## look up the display label, then discard the other antibody identifiers
  left_join(antibody_table, by = "antibody_nonnum") %>%
  select(-c(antibody_label2, antibody_nonnum, antibody)) %>%
  ## back to wide with the display label in the column name
  pivot_wider(names_from = "antibody_label",
              values_from = "value",
              names_glue = "{antibody_label} score") %>%
  ## order by position, then variant amino acid
  arrange(position, as.character(var_aa)) %>%
  ## final columns, renamed for the table
  select(Variant = variant,
         contains("antibody"),
         `Model prediction` = label)

## Save Table S3 - Random forest predictions for all variants
write_csv(variant_model_results, here("outputs", "tables","S3_RFmodel_results.csv"))

###############################################################################

## predictions restricted to missense variants (no synonymous, no nonsense)
new_predictions_missense <- filter(new_predictions,
                                   var_aa != wt_aa,
                                   var_aa != "X")

## predictions for variants in the curated set (training + test, for numbers)
known_predictions <- inner_join(
  new_predictions, curated_wide,
  by = c("variant", "ab001", "ab102", "ab124", "ab3570")
)

## predictions for variants not in the curated set
unseen_predictions <- anti_join(
  new_predictions, curated_wide,
  by = c("variant", "ab001", "ab102", "ab124", "ab3570")
)

###############################################################################

## predicted label frequencies - all variants
prediction_table_all_variants <- tabyl(new_predictions, label) %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_totals(where = "row")

## predicted label frequencies - missense only
prediction_table_missense <- tabyl(new_predictions_missense, label) %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_totals(where = "row")

## gain-of-cysteine vs. other missense variants, by predicted label
prediction_table_cysteine <- new_predictions_missense %>%
  mutate(is_cys = if_else(var_aa == "C",
                          "gain-of-cysteine",
                          "gain-of-non-cysteine")) %>% 
  tabyl(is_cys, label)

## Fisher's test on the raw counts (must run before the table is formatted)
fisher_cysteine <- fisher.test(prediction_table_cysteine)

## formatted version: totals, row percentages and counts
prediction_table_cysteine <- prediction_table_cysteine %>%
  adorn_totals(where = "col") %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## model prediction for each variant with a severity call
severity_model_performance <- eahad_severity_scores %>% 
  ## one row per variant and severity (scores are repeated per antibody)
  distinct(variant, Severity) %>%
  ## attach missense predictions; variants without a prediction stay NA
  left_join(new_predictions_missense, by = "variant") %>%
  ## readable prediction names
  mutate(prediction = case_when(.pred_class == "Benign" ~ "WT-like",
                                .pred_class == "Pathogenic" ~ "Loss of function"))

## predictions by severity: column percentages with counts
severity_model_performance_table <- tabyl(severity_model_performance,
                                          prediction, Severity) %>%
  adorn_totals(where = "row") %>%
  adorn_percentages(denominator = "col") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## Fig. S12a - model prediction vs. severity
severity_model_performance_plot <- severity_model_performance %>%
  ## figure wording for the two predictions
  mutate(prediction = case_when(prediction == "WT-like" ~ "Normal function",
                                prediction == "Loss of function" ~ "Abnormal function")) %>%
  ggplot(aes(x = Severity,
             fill = prediction)) +
  ## side-by-side (dodged) bars per severity
  geom_bar(position = "dodge", color = "black", linewidth = 0.5) +
  ## y axis
  scale_y_continuous(limits = c(0, 205),
                     breaks = seq(0, 200, by = 50),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## fill colors
  scale_fill_manual(values = c("#b2182bFF", "#2166acFF")) +
  labs(y = "Number of variants") +
  ## untitled legend placed inside the panel
  theme(legend.title = element_blank(),
        legend.background = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.position = "inside",
        legend.position.inside = c(0.33, 0.88))

## save Fig. S12a - classification by severity
ggsave(here("outputs", "supp_fig_panels", "S12a_model_by_severity.pdf"),
       plot = severity_model_performance_plot, device = cairo_pdf,
       width = 43, height = 40, units = "mm")
  
## Fig. S12b - model prediction vs. severity, PP and Gla domains only
severity_model_performance_plot_ppgla <- severity_model_performance %>%
  ## figure wording for the two predictions
  mutate(prediction = case_when(prediction == "WT-like" ~ "Normal function",
                                prediction == "Loss of function" ~ "Abnormal function")) %>%
  ## positions 29-92 (PP and Gla domains)
  filter(position %in% seq(29, 92, by = 1)) %>%
  ggplot(aes(x = Severity,
             fill = prediction)) +
  ## side-by-side (dodged) bars per severity
  geom_bar(position = "dodge", color = "black", linewidth = 0.5) +
  ## y axis
  scale_y_continuous(limits = c(0, 26),
                     breaks = seq(0, 25, by = 5),
                     labels = as.character,
                     expand = c(0, 0)) +
  ## fill colors
  scale_fill_manual(values = c("#b2182bFF", "#2166acFF")) +
  labs(y = "Number of variants") +
  ## untitled legend placed inside the panel
  theme(legend.title = element_blank(),
        legend.background = element_blank(),
        legend.key.size = unit(3, "mm"),
        legend.position = "inside",
        legend.position.inside = c(0.33, 0.88))

## save Fig. S12b - classification by severity
ggsave(here("outputs", "supp_fig_panels", "S12b_model_by_severity_ppgla.pdf"),
       plot = severity_model_performance_plot_ppgla, device = cairo_pdf,
       width = 43, height = 40, units = "mm")


```

```{r MLOF data}

## read in MLOF data (2017 data freeze) and reduce it to single amino acid
## substitutions annotated with 1-letter amino acid codes
mlof <- read_csv(here("inputs", "MLOF", "2017_F9_MLOF.csv")) %>%
  ## clean names
  clean_names(replace = c("ExAC" = "Exac", "cDNA" = "cdna")) %>%
  ## remove noncoding variants: keep rows annotated to a single exon,
  ## dropping UTR and multi-exon ("Exons") annotations
  filter(grepl("Exon", exon),
         !grepl("UTR", exon),
         !grepl("Exons", exon)) %>%
  ## remove frameshifts, deletions, duplications, and insertions
  filter(!grepl("del|dup|ins", hgvs_cdna)) %>%
  ## remove synonymous and nonsense variants
  filter(!grepl("=|\\*", hgvs_protein)) %>%
  ## remove the leading "p." / "c." prefix; the dot is escaped so that it
  ## matches a literal "." rather than any character
  mutate(hgvs_protein = gsub("^p\\.", "", hgvs_protein),
         hgvs_protein = gsub("^c\\.", "", hgvs_protein)) %>%
  ## split the HGVS protein call into WT residue, position, variant residue;
  ## [A-Za-z] is used instead of [A-z], which also matches the punctuation
  ## characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `)
  extract(hgvs_protein, into = c("wt_aa3", "position", "var_aa3"),
          regex = "(^[A-Za-z]+)([0-9]+)([A-Za-z]+)") %>%
  ## numeric position; normalize lowercase "pro" to "Pro" for the lookup
  mutate(position = as.numeric(position),
         var_aa3 = gsub("pro", "Pro", var_aa3)) %>%
  ## turn into 1 aa abbreviations by looking each 3-letter code up in
  ## hash_aa3_to_aa1 (mget() errors if a code is not present in the hash)
  mutate(var_aa = unlist(mget(var_aa3, hash_aa3_to_aa1@.xData)),
         wt_aa = unlist(mget(wt_aa3, hash_aa3_to_aa1@.xData)),
         variant = paste0(wt_aa, as.character(position), var_aa))

###############################################################################

## count the evidence codes of each strength recorded per MLOF variant,
## then reclassify each variant with and without the model prediction
classified_mlof <- mlof %>%
  group_by(variant, pathogenicity, hgvs_cdna) %>%
  ## sum every column whose name starts with the given evidence prefix
  summarise(
    moderate = sum(unlist(pick(starts_with("pm"))), na.rm = TRUE),
    strong = sum(unlist(pick(starts_with("ps"))), na.rm = TRUE),
    very_strong = sum(unlist(pick(starts_with("pvs"))), na.rm = TRUE),
    supporting = sum(unlist(pick(starts_with("pp"))), na.rm = TRUE)
  ) %>%
  ## join with RF model predictions
  inner_join(new_predictions, by = "variant") %>%
  mutate(
    ## add one piece of strong / moderate evidence when the model
    ## predicts the variant to be pathogenic
    strong2 = case_when(.pred_class == "Pathogenic" ~ strong + 1,
                        .default = strong),
    moderate2 = case_when(.pred_class == "Pathogenic" ~ moderate + 1,
                          .default = moderate),
    ## reclassify variants based on Richards 2015 and RF model output
    ## based on integrated secretion and carboxylation scores
    ##   moderate2 used for Bayesian classification (OddsPath)
    ##   strong2 used for PS3 in ACMG criteria (2015)
    evNone = reclassify_vars(very_strong, strong, moderate, supporting),
    evModerate = reclassify_vars(very_strong, strong, moderate2, supporting),
    evStrong = reclassify_vars(very_strong, strong2, moderate, supporting)
  ) %>%
  ungroup()

###############################################################################

## Fig 5h - Sankey diagram of how variant classes change as no, moderate,
## or strong functional evidence is applied
sankey_reclass <- classified_mlof %>%
  ## one row per variant per evidence level, with the following level
  make_long(evNone, evModerate, evStrong) %>%
  ## attach the number of variants in each (evidence level, class) node
  inner_join(
    classified_mlof %>%
      make_long(evNone, evModerate, evStrong) %>%
      count(x, node),
    by = c("x", "node")
  ) %>%
  ## fix the class order used for stacking and fill colors
  mutate(node = factor(node, levels = c("VUS", "LP", "P"))) %>%
  ggplot(aes(x = x,
             next_x = next_x,
             node = node,
             next_node = next_node,
             label = n,
             fill = node)) +
  geom_sankey(color = "black",
              node.color = "black",
              flow.alpha = 0.5,
              show.legend = FALSE) +
  ## node counts as plain text (no label box)
  geom_sankey_label(size = 6 / .pt, color = "black",
                    fill = NA, label.size = NA) +
  scale_x_discrete(expand = c(0, 0),
                   labels = c(evNone = "None",
                              evModerate = "Moderate",
                              evStrong = "Strong")) +
  scale_fill_manual(values = c("grey50", "coral", "firebrick3")) +
  labs(x = "Level of evidence applied") +
  ## start from an empty theme and switch the x axis text and title back on
  theme_void(base_size = 6) +
  theme(axis.text.x = element_text(),
        axis.title.x = element_text(),
        legend.position = "none")

## save Fig. 5h
ggsave(filename = here("outputs", "main_fig_panels", "5h_Sankey.pdf"),
       plot = sankey_reclass,
       device = cairo_pdf,
       width = 80, height = 45, units = "mm")

###############################################################################


## Table S4 - MLOF variant evidence codes, model prediction, and the
## resulting classification at each level of applied evidence
mlof_evidence_table <- mlof %>%
  ## join with predictions and reclassifications
  inner_join(classified_mlof,
             by = c("variant", "hgvs_cdna", "wt_aa", "position", "var_aa")) %>%
  ## functional prediction column (other .pred_class values become NA)
  mutate(RFclass = case_when(
    .pred_class == "Pathogenic" ~ "Loss of function",
    .pred_class == "Benign" ~ "WT-like"
  )) %>%
  ## arrange in variant order
  arrange(position, var_aa) %>%
  ## keep identifiers, classifications, and evidence-code columns only
  select(variant, hgvs_cdna, RFclass,
         matches("^ev|^pvs|^ps|^pm|^pp|^bs|^bm|^bp|^ba")) %>%
  ## fixed columns first, then the remaining columns in alphabetical order
  select(variant, hgvs_cdna, RFclass, evNone, evModerate, evStrong,
         order(colnames(.))) %>%
  ## human-readable headers for the fixed columns
  rename(Variant = variant,
         `HGVS cDNA sequence` = hgvs_cdna,
         `Random forest model prediction` = RFclass,
         `Class without additional evidence` = evNone,
         `Class with moderate evidence` = evModerate,
         `Class with strong evidence` = evStrong) %>%
  ## drop everything from the first underscore on (removes _xxx suffixes)
  rename_with(function(nm) gsub("_.*", "", nm)) %>%
  ## upper-case the column names that contain a digit (evidence codes)
  rename_with(toupper, matches("([0-9])")) %>%
  ## reorganize columns
  relocate(starts_with("PM"), .after = PP5) %>%
  relocate(starts_with("BA"), .after = BS4) %>%
  ## spell out the class abbreviations (any other value becomes NA)
  mutate(across(starts_with("Class"), function(cls) {
    case_when(cls == "VUS" ~ "Variant of Uncertain Significance",
              cls == "LP" ~ "Likely Pathogenic",
              cls == "P" ~ "Pathogenic")
  }))

## Save Table S4 - MLOF evidence codes
write_csv(mlof_evidence_table,
          here("outputs", "tables", "S4_MLOF_evidence.csv"))

```

```{r other proteins}

## read in flow data from other secreted proteins and prepare plot labels
other_sec_proteins_flow <- read_csv(here("inputs", "flow", "other_proteins",
                                         "all_cells_concatenated.csv")) %>%
  ## swap "del " for a delta symbol and insert line breaks into long labels
  mutate(Label = gsub("del ", "\u0394", Label, fixed = TRUE),
         Label = gsub("se C1", "se\nC1", Label, fixed = TRUE),
         Label = gsub("ned c", "ned\nc", Label, fixed = TRUE),
         Label = gsub("nal p", "nal\np", Label, fixed = TRUE),
         Label = gsub("-1 an", "-1\nan", Label, fixed = TRUE)) %>%
  ## fix the display order of the samples
  mutate(Label = factor(Label,
                        levels = c("Unrecombined\ncells", "\u0394start",
                                   "TM only", "FIX \u0394signal\npeptide",
                                   "Insulin", "Plasma protease\nC1 inhibitor",
                                   "FVII", "FVIII", "FIX", "FX",
                                   "Alpha-1\nantitrypsin"))) %>%
  ## shift all values up by |minimum| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_fluorescence = Fluorescence + abs(min(Fluorescence)) + 0.01)

## Fig. 6a - other secreted proteins, one density ridge per sample
other_proteins_plot <- other_sec_proteins_flow %>%
  ggplot() +
  ## ridge heights use the normalized density (ndensity)
  geom_density_ridges(aes(x = adj_fluorescence,
                          y = Label,
                          height = after_stat(ndensity),
                          color = Label,
                          fill = Label),
                      bandwidth = 1/32, scale = 0.9, alpha = 0.7,
                      show.legend = FALSE) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e4),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## reverse the sample order on the y axis and pad above the last ridge
  scale_y_discrete(limits = rev,
                   expand = expansion(add = c(0.5, 1.2))) +
  ## same viridis palette for outline and fill
  scale_color_viridis_d(option = "C", end = 0.8) +
  scale_fill_viridis_d(option = "C", end = 0.8) +
  labs(x = "Strep II tag-Alexa-488") +
  ## no y axis title; horizontal grid lines as ridge baselines
  theme(axis.title.y = element_blank(),
        panel.grid.major.y = element_line(color = "grey40"))

## save Figure 6a
ggsave(filename = here("outputs", "main_fig_panels",
                       "6a_other_secreted_proteins.pdf"),
       plot = other_proteins_plot,
       device = cairo_pdf,
       width = 75, height = 107.5, units = "mm")

```

```{r FVIII}

## import FVIII data for A1-A3 antibody (GMA8012)
FVIII_8012 <- list.files(path = here("inputs", "flow",
                                     "FVIII_GMA8012",
                                     "scale"),
                         ## `pattern` is a regular expression, not a glob;
                         ## match only file names that end in ".csv"
                         pattern = "\\.csv$",
                         recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_GMA8012",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8012 = alexa_fluor_647_a,
         bfp = bv421_a,
         mcherry = pe_cf594_a) %>%
  ## extract source path column to usable variables
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*293F_([A-Z]*)_.*_(.*)\\.csv") %>%
  ## shift all values up by |minimum| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab8012 = ab8012 + abs(min(ab8012)) + 0.01,
         ## change names and order: "LP" samples are shown as "NC"
         variant = case_when(variant == "LP" ~ "NC",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII")))

###############################################################################

## Fig. 6b - FVIII A1-A3 domain
FVIII_8012_plot <- FVIII_8012 %>%
  ggplot(aes(x = adj_ab8012,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e6),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII A1-A3 domains-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. 6b
ggsave(filename = here("outputs", "main_fig_panels", "6b_FVIII_A1A3.pdf"),
       plot = FVIII_8012_plot,
       device = cairo_pdf,
       width = 66, height = 35, units = "mm")

###############################################################################

## import FVIII data for discontinuous A2 antibody (GMA012)
FVIII_012 <- list.files(path = here("inputs", "flow",
                                    "FVIII_GMA012",
                                    "scale"),
                        ## `pattern` is a regular expression, not a glob;
                        ## match only file names that end in ".csv"
                        pattern = "\\.csv$",
                        recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_GMA012",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab012 = alexa_fluor_647_a,
         bfp = bv421_a,
         mcherry = pe_cf594_a) %>%
  ## extract source path column to usable variables
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant", "fluor"),
          regex = ".*293F_([A-Z]*)_(.*)\\.csv") %>%
  ## shift all values up by |minimum| so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors
  mutate(adj_ab012 = ab012 + abs(min(ab012)) + 0.01,
         ## change names and order: "LP" samples are shown as "NC"
         variant = case_when(variant == "LP" ~ "NC",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII")))

###############################################################################

## Fig. 6c - FVIII A2 domain (discontinuous epitope)
FVIII_012_plot <- FVIII_012 %>%
  ggplot(aes(x = adj_ab012,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII discontinuous A2 domain-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. 6c
ggsave(filename = here("outputs", "main_fig_panels", "6c_FVIII_A2.pdf"),
       plot = FVIII_012_plot,
       device = cairo_pdf,
       width = 66, height = 35, units = "mm")

```

```{r FVIII variants}

## import FVIII variant data for A1 antibody (GMA8005)
FVIII_vars_8005 <- list.files(path = here("inputs", "flow",
                                          "FVIII_vars_GMA8005",
                                          "scale"),
                              ## `pattern` is a regular expression, not a
                              ## glob; match only names ending in ".csv"
                              pattern = "\\.csv$",
                              recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_vars_GMA8005",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8005 = alexa_fluor_647_a) %>%
  ## extract the variant name from the source path, keeping the path column
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant"),
          regex = ".*scale_8005_(.*?)_.*\\.csv",
          remove = FALSE) %>%
  ## change names and order: "LP" is shown as "NC", "FRED" as "FVIII"
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "FRED" ~ "FVIII",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII", "R550C", "R2326P", "W2081C",
                                     "R2169H", "R301C")))

###############################################################################

## Fig. S14a - FVIII WT A1 epitope (NC and FVIII samples only)
FVIII_WT_8005_plot <- FVIII_vars_8005 %>%
  ## positive values only, as required for the log axis
  filter(ab8005 > 0) %>%
  filter(variant %in% c("NC", "FVIII")) %>%
  ggplot(aes(x = ab8005,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(0.05, 1.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 6),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII A1 domain-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. S14a
ggsave(filename = here("outputs", "supp_fig_panels", "S14a_FVIII_WT_A1.pdf"),
       plot = FVIII_WT_8005_plot,
       device = cairo_pdf,
       width = 35, height = 45, units = "mm")

###############################################################################

## Fig. 6d - FVIII variants A1 epitope, one density ridge per variant
FVIII_vars_8005_plot <- FVIII_vars_8005 %>%
  ## positive values only, as required for the log axis
  filter(ab8005 > 0) %>%
  ggplot() +
  ## variants listed top to bottom in factor-level order; ridge heights
  ## use the normalized density (ndensity)
  geom_density_ridges(aes(x = ab8005,
                          y = fct_rev(variant),
                          height = after_stat(ndensity),
                          color = variant,
                          fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.5,
                      show.legend = FALSE) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(0.95, 1.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 3),
                labels = trans_format("log10", math_format(10^.x))) +
  ## same seven reordered palette colors for outline and fill
  scale_color_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  scale_fill_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  ## axis title
  labs(x = "FVIII A1 domain-Alexa-647") +
  ## remove the y axis title
  theme(axis.title.y = element_blank())

## save Fig. 6d
ggsave(filename = here("outputs", "main_fig_panels", "6d_FVIII_vars_A1.pdf"),
       plot = FVIII_vars_8005_plot,
       device = cairo_pdf,
       width = 45, height = 35, units = "mm")


###############################################################################

## import FVIII WT data for A2 antibody (GMA8016)
FVIII_wt_8016 <- list.files(path = here("inputs", "flow",
                                        "FVIII_wt_GMA8016", "scale"),
                            recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(function(flow_file) {
    read_flow_path(here("inputs", "flow", "FVIII_wt_GMA8016",
                        "scale", flow_file))
  }) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8016 = alexa_fluor_647_a) %>%
  ## extract the sample name from the source path, keeping the path column
  extract(col = source_path, into = c("variant"),
          regex = ".*HEK293F_(.*?)\\s.*\\.csv",
          remove = FALSE) %>%
  ## show "LP" samples as "NC" and fix the display order
  mutate(variant = factor(case_when(variant == "LP" ~ "NC",
                                    .default = variant),
                          levels = c("NC", "FVIII")))

###############################################################################

## Fig. S14b - FVIII WT A2 epitope
FVIII_WT_8016_plot <- FVIII_wt_8016 %>%
  ## positive values only, as required for the log axis
  filter(ab8016 > 0) %>%
  ggplot(aes(x = ab8016,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(0.05, 5.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 6),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII A2 domain-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. S14b
ggsave(filename = here("outputs", "supp_fig_panels", "S14b_FVIII_WT_A2.pdf"),
       plot = FVIII_WT_8016_plot,
       device = cairo_pdf,
       width = 35, height = 45, units = "mm")

###############################################################################

## import FVIII variant data for A2 discontinuous antibody (GMA012)
FVIII_vars_012 <- list.files(path = here("inputs", "flow",
                                         "FVIII_vars_GMA012",
                                         "scale"),
                             ## `pattern` is a regular expression, not a
                             ## glob; match only names ending in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_vars_GMA012",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab012 = alexa_fluor_647_a) %>%
  ## extract the variant name from the source path, keeping the path column
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant"),
          regex = ".*scale_012_(.*?)_.*\\.csv",
          remove = FALSE) %>%
  ## change names and order: "LP" is shown as "NC", "FRED" as "FVIII"
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "FRED" ~ "FVIII",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII", "R550C", "R2326P", "W2081C",
                                     "R2169H", "R301C")))

###############################################################################

## Fig. 6e - FVIII variants A2 discontinuous epitope, one ridge per variant
FVIII_vars_012_plot <- FVIII_vars_012 %>%
  ## positive values only, as required for the log axis
  filter(ab012 > 0) %>%
  ggplot() +
  ## variants listed top to bottom in factor-level order; ridge heights
  ## use the normalized density (ndensity)
  geom_density_ridges(aes(x = ab012,
                          y = fct_rev(variant),
                          height = after_stat(ndensity),
                          color = variant,
                          fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.5,
                      show.legend = FALSE) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 5.05e4),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## same seven reordered palette colors for outline and fill
  scale_color_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  scale_fill_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  ## axis title
  labs(x = "FVIII A2 domain-Alexa-647") +
  ## remove the y axis title
  theme(axis.title.y = element_blank())

## save Fig. 6e
ggsave(filename = here("outputs", "main_fig_panels", "6e_FVIII_vars_A2.pdf"),
       plot = FVIII_vars_012_plot,
       device = cairo_pdf,
       width = 45, height = 35, units = "mm")

###############################################################################

###############################################################################

## import FVIII WT data for the GMA8001 antibody
## (plotted below as the A3 epitope, Fig. S14c)
FVIII_wt_8001 <- list.files(path = here("inputs", "flow",
                                        "FVIII_wt_GMA8001", "scale"),
                            recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(function(flow_file) {
    read_flow_path(here("inputs", "flow", "FVIII_wt_GMA8001",
                        "scale", flow_file))
  }) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8001 = alexa_fluor_647_a) %>%
  ## extract the sample name from the source path, keeping the path column
  extract(col = source_path, into = c("variant"),
          regex = ".*HEK293F_(.*?)\\s.*\\.csv",
          remove = FALSE) %>%
  ## show "LP" samples as "NC" and fix the display order
  mutate(variant = factor(case_when(variant == "LP" ~ "NC",
                                    .default = variant),
                          levels = c("NC", "FVIII")))

###############################################################################

## Fig. S14c - FVIII WT A3 epitope
FVIII_WT_8001_plot <- FVIII_wt_8001 %>%
  ## positive values only, as required for the log axis
  filter(ab8001 > 0) %>%
  ggplot(aes(x = ab8001,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(0.05, 5.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 6),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII A3 domain-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. S14c
ggsave(filename = here("outputs", "supp_fig_panels", "S14c_FVIII_WT_A3.pdf"),
       plot = FVIII_WT_8001_plot,
       device = cairo_pdf,
       width = 35, height = 45, units = "mm")

###############################################################################

## import FVIII variant data for the GMA8040 antibody
## (plotted below as the light chain, Fig. 6f)
FVIII_vars_8040 <- list.files(path = here("inputs", "flow",
                                          "FVIII_vars_GMA8040",
                                          "scale"),
                              ## `pattern` is a regular expression, not a
                              ## glob; match only names ending in ".csv"
                              pattern = "\\.csv$",
                              recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_vars_GMA8040",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8040 = alexa_fluor_647_a) %>%
  ## extract the variant name from the source path, keeping the path column
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant"),
          regex = ".*scale_8040_(.*?)_.*\\.csv",
          remove = FALSE) %>%
  ## change names and order: "LP" is shown as "NC", "FRED" as "FVIII"
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "FRED" ~ "FVIII",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII", "R550C", "R2326P", "W2081C",
                                     "R2169H", "R301C")))

###############################################################################

## Fig. 6f - FVIII variants light chain (LC), one density ridge per variant
FVIII_vars_8040_plot <- FVIII_vars_8040 %>%
  ## positive values only, as required for the log axis
  filter(ab8040 > 0) %>%
  ggplot() +
  ## variants listed top to bottom in factor-level order; ridge heights
  ## use the normalized density (ndensity)
  geom_density_ridges(aes(x = ab8040,
                          y = fct_rev(variant),
                          height = after_stat(ndensity),
                          color = variant,
                          fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.5,
                      show.legend = FALSE) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 5.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## same seven reordered palette colors for outline and fill
  scale_color_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  scale_fill_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  ## axis title
  labs(x = "FVIII Light chain-Alexa-647") +
  ## remove the y axis title
  theme(axis.title.y = element_blank())

## save Fig. 6f
ggsave(filename = here("outputs", "main_fig_panels", "6f_FVIII_vars_LC.pdf"),
       plot = FVIII_vars_8040_plot,
       device = cairo_pdf,
       width = 45, height = 35, units = "mm")

###############################################################################

## import FVIII variant data for C1 antibody (GMA8011)
FVIII_vars_8011 <- list.files(path = here("inputs", "flow",
                                          "FVIII_vars_GMA8011",
                                          "scale"),
                              ## `pattern` is a regular expression, not a
                              ## glob; match only names ending in ".csv"
                              pattern = "\\.csv$",
                              recursive = TRUE) %>%
  ## read every file (paths are relative to the "scale" directory) and
  ## row-bind the results
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_vars_GMA8011",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8011 = alexa_fluor_647_a) %>%
  ## extract the variant name from the source path, keeping the path column
  ## (source_path is presumably added by read_flow_path - confirm there)
  extract(col = source_path, into = c("variant"),
          regex = ".*scale_8011_(.*?)_.*\\.csv",
          remove = FALSE) %>%
  ## change order and names: "LP" is shown as "NC", "FRED" as "FVIII"
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "FRED" ~ "FVIII",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII", "R550C", "R2326P", "W2081C",
                                     "R2169H", "R301C")))

###############################################################################

## Fig. S14d - FVIII WT C1 epitope (NC and FVIII samples only)
FVIII_WT_8011_plot <- FVIII_vars_8011 %>%
  ## positive values only, as required for the log axis
  filter(ab8011 > 0) %>%
  filter(variant %in% c("NC", "FVIII")) %>%
  ggplot(aes(x = ab8011,
             y = after_stat(scaled),
             color = variant,
             fill = variant)) +
  ## density curves scaled to a maximum of 1
  geom_density(bw = 1/64, alpha = 0.7) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(0.05, 5.05e5),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 6),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with a small margin and plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## same two colors for outline and fill
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## axis titles
  labs(x = "FVIII C1 domain-Alexa-647",
       y = "Density") +
  ## untitled legend anchored to the top-left corner of the panel
  theme(legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## save Fig. S14d
ggsave(filename = here("outputs", "supp_fig_panels", "S14d_FVIII_WT_C1.pdf"),
       plot = FVIII_WT_8011_plot,
       device = cairo_pdf,
       width = 35, height = 45, units = "mm")

###############################################################################

## Fig. 6g - FVIII variants C1 epitope, one density ridge per variant
FVIII_vars_8011_plot <- FVIII_vars_8011 %>%
  ## positive values only, as required for the log axis
  filter(ab8011 > 0) %>%
  ggplot() +
  ## variants listed top to bottom in factor-level order; ridge heights
  ## use the normalized density (ndensity)
  geom_density_ridges(aes(x = ab8011,
                          y = fct_rev(variant),
                          height = after_stat(ndensity),
                          color = variant,
                          fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.5,
                      show.legend = FALSE) +
  ## log10 x axis labeled as powers of ten
  scale_x_log10(limits = c(9.95, 1.05e6),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## same seven reordered palette colors for outline and fill
  scale_color_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  scale_fill_manual(
    values = paletteer_d("PNWColors::Bay", n = 7,
                         type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]
  ) +
  ## axis title
  labs(x = "FVIII C1 domain-Alexa-647") +
  ## remove the y axis title
  theme(axis.title.y = element_blank())

## save Fig. 6g
ggsave(filename = here("outputs", "main_fig_panels", "6g_FVIII_vars_C1.pdf"),
       plot = FVIII_vars_8011_plot,
       device = cairo_pdf,
       width = 45, height = 35, units = "mm")

###############################################################################

## import FVIII data for C2 antibody
## reads every CSV found (recursively) under
## inputs/flow/FVIII_vars_GMA8006/scale and row-binds them
FVIII_vars_8006 <- list.files(path = here("inputs", "flow",
                                   "FVIII_vars_GMA8006",
                                   "scale"),
                             ## `pattern` is a regular expression, not a glob:
                             ## match file names that end in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "FVIII_vars_GMA8006", 
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(ab8006 = alexa_fluor_647_a) %>%
  ## extract the variant token from the source path; the path column is kept
  extract(col = source_path, into = c("variant"),
          regex = ".*scale_8006_(.*?)_.*\\.csv",
          remove = FALSE) %>%
  ## relabel LP -> NC and FRED -> FVIII, then fix the plotting order
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "FRED" ~ "FVIII",
                             TRUE ~ variant),
         variant = factor(variant,
                          levels = c("NC", "FVIII", "R550C", "R2326P", "W2081C",
                                     "R2169H", "R301C")))

###############################################################################

## Fig. S14e - FVIII WT C2 epitope
## overlaid densities (each scaled to a maximum of 1) for NC and FVIII
FVIII_WT_8006_plot <- FVIII_vars_8006 %>%
  ## NC and FVIII only; positive events only because the x axis is log10
  filter(variant %in% c("NC", "FVIII"),
         ab8006 > 0) %>%
  ggplot(aes(x = ab8006,
             y = after_stat(scaled),
             colour = variant,
             fill = variant)) +
  geom_density(bw = 1/64, alpha = 0.7) +
  ## same two palette entries for outline and fill
  scale_colour_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5)]) +
  ## log10 x axis with 10^n tick labels
  scale_x_log10(limits = c(0.95, 5.05e4),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 6),
                labels = trans_format("log10", math_format(10^.x))) +
  ## y axis from 0 to 1 with plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Density",
       x = "FVIII C2 domain-Alexa-647") +
  ## untitled legend anchored in the top-left corner of the panel
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"),
        legend.position = "inside",
        legend.position.inside = c(0, 1),
        legend.justification = c(0, 1))

## write Fig. S14e to the supplementary panel folder
ggsave(filename = here("outputs", "supp_fig_panels", "S14e_FVIII_WT_C2.pdf"),
       plot = FVIII_WT_8006_plot,
       device = cairo_pdf,
       width = 35, height = 45, units = "mm")

###############################################################################

## Fig. 6h - FVIII variants C2
## ridgeline plot: one height-normalised density of C2 antibody signal
## per variant
FVIII_vars_8006_plot <- ggplot(data = filter(FVIII_vars_8006, ab8006 > 0)) +
  ## non-positive events are dropped above because the x axis is log10
  geom_density_ridges(mapping = aes(x = ab8006,
                                    y = fct_rev(variant),
                                    height = after_stat(ndensity),
                                    colour = variant,
                                    fill = variant),
                      bandwidth = 1/64, scale = 1, alpha = 0.5,
                      show.legend = FALSE) +
  ## same manual palette for outline and fill, one entry per variant level
  scale_colour_manual(values = paletteer_d("PNWColors::Bay", n = 7, type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay", n = 7, type = "continuous")[c(1, 7, 2, 3, 4, 5, 6)]) +
  ## log10 x axis, 10^n tick labels, no expansion padding
  scale_x_log10(limits = c(9.95, 1.05e4),
                expand = c(0, 0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  ## variant names label the ridges, so the y axis title is dropped
  theme(axis.title.y = element_blank()) +
  labs(x = "FVIII C2 domain-Alexa-647")

## write Fig. 6h to the main panel folder
ggsave(filename = here("outputs", "main_fig_panels", "6h_FVIII_vars_C2.pdf"),
       plot = FVIII_vars_8006_plot,
       device = cairo_pdf,
       width = 45, height = 35, units = "mm")

```

```{r technical replicates}

## Table S8 - Illumina PCR technical replicate correlations
## one row per sample: Pearson correlation of rep1 vs rep2 (pairwise-complete
## observations), rounded to two decimals
PCR_correlation <- illumina_data %>%
  group_by(sample) %>%
  summarise(pear_cor = round(cor(rep1, rep2,
                                 method = "pearson",
                                 use = "pairwise.complete.obs"),
                             digits = 2))

## write Table S8
write_csv(PCR_correlation,
          here("outputs", "tables", "S8_technical_replicate_correlations.csv"))

###############################################################################

## mean Illumina technical replicate correlation across samples
## (taken over the already-rounded per-sample values)
mean_PCR_correlation <- mean(PCR_correlation$pear_cor)

```

```{r other protein variants}

## F7 variants
## reads every CSV found (recursively) under
## inputs/flow/other_protein_variants/F7/scale and row-binds them
F7_var_data <- list.files(path = here("inputs", "flow",
                                              "other_protein_variants", "F7",
                                              "scale"),
                          ## `pattern` is a regular expression, not a glob:
                          ## match file names that end in ".csv"
                          pattern = "\\.csv$",
                          recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "other_protein_variants", "F7",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = alexa_488_a,
         bfp = bv421_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  extract(col = source_path, into = c("gene_var", "fluor"),
          regex = ".*A_(.*)\\sStrepII_(.*)\\s\\+.csv") %>%
  ## split "gene variant"; with a single token, fill = "left" leaves gene NA
  separate(col = gene_var, into = c("gene", "variant"),
           sep = "\\s", fill = "left") %>%
  ## change names and order
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "Var" ~ "C195R",
                             TRUE ~ variant),
         ## NC rows get "NC" as their gene label
         gene = case_when(variant == "NC" ~ variant,
                          TRUE ~ gene),
         ## fix the plotting order
         variant = factor(variant, levels = c("NC", "WT", "C195R")))

## Fig. 6i - F7 secretion variant
## overlaid densities (each scaled to a maximum of 1) of Strep II tag signal
F7_vars_plot <- F7_var_data %>%
  ## values > 0 for log, as in the other density panels of this chunk;
  ## these events already fell below the x-axis limits, so the curves are
  ## unchanged and only the log10 NaN warnings go away
  filter(abstrep > 0) %>%
  ## plot
  ggplot() +
  geom_density(aes(x = abstrep,
                   y = after_stat(scaled),
                   fill = variant,
                   color = variant),
               alpha = 0.5, bw = 1/64) +
  ## adjust fill, x and y axes
  scale_x_log10(limits = c(9.95, 1.05e5),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-0.01, 1.01),
                     breaks = seq(0, 1, by = 0.25),
                     labels = function(x) as.character(x)) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  scale_color_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  ## add labels
  labs(x = "Strep II tag-Alexa-488",
       y = "Density") +
  ## adjust legend features
  theme(legend.key.size = unit(3, "mm"),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA))

## save Fig. 6i
ggsave(here("outputs", "main_fig_panels", "6i_FVII_variant.pdf"),
       plot = F7_vars_plot, device = cairo_pdf,
       height = 35, width = 55, units = "mm")

###############################################################################

## F10 and SERPINA1 variants
## reads every CSV found (recursively) under
## inputs/flow/other_protein_variants/F10_SA/scale and row-binds them
FX_SA_var_data <- list.files(path = here("inputs", "flow",
                                         "other_protein_variants", "F10_SA",
                                              "scale"),
                             ## `pattern` is a regular expression, not a glob:
                             ## match file names that end in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "other_protein_variants", "F10_SA",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = alexa_488_a) %>%
  ## extract replicate letter and "gene_variant" token from the source path
  extract(col = source_path, into = c("rep", "gene_var"),
          regex = ".*scale_rep([A-Z]).*ning_(.*)_.*\\+.csv") %>%
  ## split "gene_variant"; with a single token, fill = "left" leaves gene NA
  separate(col = gene_var, into = c("gene", "variant"),
           sep = "_", fill = "left") %>%
  ## adjust so that no negative values remain and
  ## add small pseudocount for log plotting to prevent log(0) errors;
  ## the shift uses the minimum over the whole combined table
  ## NOTE(review): if that minimum were positive it would still be added
  mutate(adj_abstrep = abstrep + abs(min(abstrep)) + 0.01,
         ## adjust labels
         variant = case_when(variant == "LP" ~ "NC",
                             TRUE ~ variant),
         ## NC rows get "NC" as their gene label
         gene = case_when(variant == "NC" ~ variant,
                          TRUE ~ gene))

###############################################################################

## Fig. 6j - F10 secretion variants
## overlaid densities (each scaled to a maximum of 1) of Strep II tag signal
F10_vars_plot <- FX_SA_var_data %>%
  ## replicate A, F10 and NC rows, positive events only (log10 axis)
  filter(rep == "A",
         gene %in% c("F10", "NC"),
         abstrep > 0) %>%
  ## fix the legend / draw order of the variants
  mutate(variant = factor(variant, levels = c("NC", "WT", "W348L", "T273M"))) %>%
  ggplot(aes(x = abstrep,
             y = after_stat(scaled),
             colour = variant,
             fill = variant)) +
  geom_density(bw = 1/64, alpha = 0.5) +
  ## same four palette entries for outline and fill
  scale_colour_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 4, 3)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 4, 3)]) +
  ## log10 x axis with 10^n tick labels
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(9.95, 1.05e5)) +
  ## y axis from 0 to 1 with plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Density",
       x = "Strep II tag-Alexa-488") +
  ## small, untitled legend with a transparent background
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## write Fig. 6j to the main panel folder
ggsave(filename = here("outputs", "main_fig_panels", "6j_FX_variant.pdf"),
       plot = F10_vars_plot,
       device = cairo_pdf,
       width = 55, height = 35, units = "mm")

###############################################################################

## INS and SERPING1
## reads every CSV found (recursively) under
## inputs/flow/other_protein_variants/INS_SG/scale and row-binds them
INS_SG_var_data <- list.files(path = here("inputs", "flow",
                                              "other_protein_variants", "INS_SG",
                                              "scale"),
                             ## `pattern` is a regular expression, not a glob:
                             ## match file names that end in ".csv"
                             pattern = "\\.csv$",
                             recursive = TRUE) %>%
  map_df(~read_flow_path(here("inputs", "flow",
                              "other_protein_variants", "INS_SG",
                              "scale", .))) %>%
  ## clean names
  clean_names() %>%
  ## rename Flowjo default variable names
  rename(abstrep = alexa_488_a,
         bfp = bv421_a,
         mcherry = m_cherry_a) %>%
  ## extract source path column to usable variables
  extract(col = source_path, into = c("gene_var", "fluor"),
          regex = ".*Cx_(.*)_(.*)\\s\\+.csv") %>%
  ## split "gene variant"; with a single token, fill = "left" leaves gene NA
  separate(col = gene_var, into = c("gene", "variant"),
           sep = "\\s", fill = "left") %>%
  ## change names: LP -> NC, and the generic "Var" label becomes the
  ## gene-specific variant name
  mutate(variant = case_when(variant == "LP" ~ "NC",
                             variant == "Var" & gene == "Insulin" ~ "L30P",
                             variant == "Var" & gene == "SERPING" ~ "S150F",
                             TRUE ~ variant),
         ## NC rows get "NC" as their gene label
         gene = case_when(variant == "NC" ~ variant,
                          TRUE ~ gene))

## Fig. 6k - INS secretion variant
## overlaid densities (each scaled to a maximum of 1) of Strep II tag signal
INS_vars_plot <- INS_SG_var_data %>%
  ## drop SERPING rows; positive events only (log10 axis)
  filter(abstrep > 0,
         gene != "SERPING") %>%
  ## fix the legend / draw order of the variants
  mutate(variant = factor(variant, levels = c("NC", "WT", "L30P"))) %>%
  ggplot(aes(x = abstrep,
             y = after_stat(scaled),
             colour = variant,
             fill = variant)) +
  geom_density(bw = 1/64, alpha = 0.5) +
  ## same three palette entries for outline and fill
  scale_colour_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  ## log10 x axis with 10^n tick labels
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(9.95, 1.05e4)) +
  ## y axis from 0 to 1 with plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Density",
       x = "Strep II tag-Alexa-488") +
  ## small, untitled legend with a transparent background
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## write Fig. 6k to the main panel folder
ggsave(filename = here("outputs", "main_fig_panels", "6k_INS_variant.pdf"),
       plot = INS_vars_plot,
       device = cairo_pdf,
       width = 55, height = 35, units = "mm")

###############################################################################

## Fig. 6l - SG secretion variant
## overlaid densities (each scaled to a maximum of 1) of Strep II tag signal
SG_vars_plot <- INS_SG_var_data %>%
  ## drop Insulin rows; positive events only (log10 axis)
  filter(abstrep > 0,
         gene != "Insulin") %>%
  ## fix the legend / draw order of the variants
  mutate(variant = factor(variant, levels = c("NC", "WT", "S150F"))) %>%
  ggplot(aes(x = abstrep,
             y = after_stat(scaled),
             colour = variant,
             fill = variant)) +
  geom_density(bw = 1/64, alpha = 0.5) +
  ## same three palette entries for outline and fill
  scale_colour_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 3)]) +
  ## log10 x axis with 10^n tick labels
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(9.95, 1.05e5)) +
  ## y axis from 0 to 1 with plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Density",
       x = "Strep II tag-Alexa-488") +
  ## small, untitled legend with a transparent background
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## write Fig. 6l to the main panel folder
ggsave(filename = here("outputs", "main_fig_panels", "6l_SG_variant.pdf"),
       plot = SG_vars_plot,
       device = cairo_pdf,
       width = 55, height = 35, units = "mm")

###############################################################################

## Fig. 6m - SA secretion variants
## overlaid densities (each scaled to a maximum of 1) of Strep II tag signal
SA_vars_plot <- FX_SA_var_data %>%
  ## replicate B, SERPINA and NC rows, positive events only (log10 axis)
  ## NOTE(review): the F10 panel built from the same table uses replicate A -
  ## confirm that replicate B is the intended one here
  filter(rep == "B",
         gene %in% c("SERPINA", "NC"),
         abstrep > 0) %>%
  ## fix the legend / draw order of the variants
  mutate(variant = factor(variant, levels = c("NC", "WT", "F251C", "I74N"))) %>%
  ggplot(aes(x = abstrep,
             y = after_stat(scaled),
             colour = variant,
             fill = variant)) +
  geom_density(bw = 1/64, alpha = 0.5) +
  ## same four palette entries for outline and fill
  scale_colour_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 4, 3)]) +
  scale_fill_manual(values = paletteer_d("PNWColors::Bay")[c(1, 5, 4, 3)]) +
  ## log10 x axis with 10^n tick labels
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(9.95, 1.05e5)) +
  ## y axis from 0 to 1 with plain-text tick labels
  scale_y_continuous(limits = c(-0.01, 1.01),
                     expand = c(0, 0),
                     breaks = seq(0, 1, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Density",
       x = "Strep II tag-Alexa-488") +
  ## small, untitled legend with a transparent background
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.key.size = unit(3, "mm"))

## write Fig. 6m to the main panel folder
ggsave(filename = here("outputs", "main_fig_panels", "6m_SA_variant.pdf"),
       plot = SA_vars_plot,
       device = cairo_pdf,
       width = 55, height = 35, units = "mm")

## geometric mean for each SA variant
geomean_SA <- FX_SA_var_data %>%
  ## drop F10 rows (SERPINA and NC rows remain)
  filter(gene != "F10") %>%
  ## calculate geometric mean for each variant-replicate pair (thousands)
  group_by(variant, rep) %>%
  summarise(geomean_adj_abstrep = exp(mean(log(adj_abstrep))) / 1000) %>%
  ungroup()

## t.test: F251C vs. WT replicate geometric means
## two-sample (Welch) test; calling t.test() on the pooled WT + variant values
## would be a one-sample test of "mean = 0", not a comparison between the two
## the tidied result stays packed in a single `tt` column
geomean_SA_F251C_tt <- geomean_SA %>%
  ## keep only the two groups being compared
  filter(variant %in% c("WT", "F251C")) %>%
  summarise(tt = t.test(geomean_adj_abstrep[variant == "F251C"],
                        geomean_adj_abstrep[variant == "WT"]) %>%
              tidy())

## t.test: I74N vs. WT replicate geometric means (same two-sample test)
geomean_SA_I74N_tt <- geomean_SA %>%
  ## keep only the two groups being compared
  filter(variant %in% c("WT", "I74N")) %>%
  summarise(tt = t.test(geomean_adj_abstrep[variant == "I74N"],
                        geomean_adj_abstrep[variant == "WT"]) %>%
              tidy())

```

```{r gnomad}

## import gnomad data on all FIX variants
gnomad <- read_csv(here("inputs", "gnomad", "gnomad4_f9_missense.csv")) %>%
  ## split protein-level names of the form "p.<Xxx><pos><rest>" into parts;
  ## the dot is escaped so it only matches a literal ".", and the three-letter
  ## code uses [A-Za-z] because [A-z] also matches "[", "\", "]", "^", "_", "`"
  separate_wider_regex(cols = variant,
                       patterns = c("p\\.",
                                    wt_aa = "[A-Za-z]{3}",
                                    position = "[0-9]{1,3}",
                                    var_aa = ".*")) %>%
  ## translate three-letter codes through hash_aa3_to_aa1 and rebuild the
  ## variant name as <wt_aa><position><var_aa>
  mutate(position = as.numeric(position),
         wt_aa = unlist(mget(wt_aa, hash_aa3_to_aa1@.xData)),
         var_aa = unlist(mget(var_aa, hash_aa3_to_aa1@.xData)),
         variant = paste0(wt_aa, as.character(position), var_aa)) %>%
  ## keep only variants with at least one hemizygote (n_hemizygote_xy > 0)
  filter(n_hemizygote_xy > 0) %>%
  ## join with scored variants and predictions
  left_join(variant_classification, by = c("variant", "position")) %>%
  left_join(new_predictions_missense %>%
              select(-starts_with("ab")),
            by = c("wt_aa", "position", "var_aa", "variant"))

## gnomad table by MultiSTEP prediction
## counts and percentages per label (rows without a label dropped), plus total
prediction_table_gnomad <- gnomad %>%
  filter(!is.na(label)) %>%
  tabyl(label) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting(digits = 2)

## Fig. 5g - gnomAD hemizygous frequency x model evaluation
## stacked histogram of maf_xy (log10 axis), filled by predicted label
gnomad_plot <- gnomad %>%
  ## cleanup label (any other label value becomes NA and is dropped below)
  mutate(label = case_when(label == "WT-like" ~ "Normal function",
                           label == "Loss of function" ~ "Abnormal function")) %>%
  ## remove NA
  filter(!is.na(maf_xy),
         !is.na(label)) %>%
  ## plot
  ggplot(aes(x = maf_xy,
             fill = label)) + 
  # add histogram
  geom_histogram(bins = 25, color = "black", linewidth = 0.3) + 
  ## add prevalence line
  ## xintercept is passed as a parameter so exactly one line is drawn;
  ## inside aes() the constant is recycled to one line per data row
  geom_vline(xintercept = 1/20000, linetype = "dashed") +
  ## scale axes
  scale_x_log10(expand = c(0, 0),
                limits = c(8e-7, 1.5e0),
                breaks = trans_breaks("log10", function(x) 10^x, n = 4),
                labels = trans_format("log10", math_format(10^.x))) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(-0.01, 61),
                     breaks = seq(0, 60, by = 20),
                     labels = function(x) as.character(x)) +
  ## scale fill
  scale_fill_manual(values = c("#b2182bFF", "#2166acFF")) +
  ## add labs
  labs(x = "MAF in hemizygotes",
       y = "Number of variants") +
  ## adjust legend location and features
  theme(legend.position = "inside",
        legend.position.inside = c(0.65, 0.88),
        legend.key.size = unit(3, "mm"),
        legend.title = element_blank(),
        legend.background = element_rect(fill = NA))

## save Fig. 5g
ggsave(here("outputs", "main_fig_panels", "5g_gnomad.pdf"),
       plot = gnomad_plot, device = cairo_pdf,
       height = 45, width = 40, units = "mm")

```

```{r consurf}

## read in conservation score data from Consurf
## per-position ConSurf rows joined to antibody-124 variant scores
consurf <- read_csv(here("inputs", "consurf", "consurf_values.csv")) %>%
  clean_names() %>%
  ## "function" is a reserved word, so the column gets a usable name
  rename(conserved = `function`) %>%
  ## drop positions listed in light_chain_epitopes
  filter(!pos %in% light_chain_epitopes) %>%
  ## attach scores: non-WT, non-synonymous, non-"X" variants for antibody 124
  ## that have a score
  left_join(filter(scored_variants_average,
                   variant != "WT",
                   wt_aa != var_aa,
                   var_aa != "X",
                   antibody == "124",
                   !is.na(average_score2)),
            by = c("pos" = "position")) %>%
  mutate(
    ## grades carrying "*" are flagged before the marker is stripped
    lowconf = if_else(grepl("\\*", color), "fail", "pass"),
    color = gsub("\\*", "", color),
    ## readable conservation labels
    conserved = case_when(is.na(conserved) ~ "not conserved",
                          conserved == "f" ~ "conserved, exposed",
                          conserved == "s" ~ "conserved, buried"),
    ## at or above the lower_124 threshold counts as WT-like
    ## NOTE(review): rows whose average_score2 is NA (positions with no
    ## joined score) also fall through to "Low secretion" - confirm intended
    is_low = case_when(
      average_score2 >= pull(syn_threshold_wide, lower_124) ~ "WT-like secretion",
      TRUE ~ "Low secretion"
    )
  ) %>%
  ## keep only positions whose grade was not flagged
  filter(lowconf != "fail")

###############################################################################

## secretion class x ConSurf grade: column percentages with counts
consurf_low <- consurf %>% 
  tabyl(is_low, color) %>%
  adorn_totals(where = "row") %>%
  adorn_percentages(denominator = "col") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## contingency table: high (grades 7-9) vs. low (grades 1-3) conservation
## against secretion class; grades 4-6 and ungraded rows are left out
consurf_low_stat <- consurf %>%
  mutate(group = case_when(color %in% as.character(1:3) ~ "low_cons",
                           color %in% as.character(4:6) ~ "med_cons",
                           color %in% as.character(7:9) ~ "high_cons")) %>%
  filter(group != "med_cons") %>%
  tabyl(group, is_low)

## Fisher's test for secretion vs. conservation
fisher_consurf <- fisher.test(consurf_low_stat)

## same contingency table with a total column, row percentages and counts
consurf_low_stat_pct <- consurf_low_stat %>%
  adorn_totals(where = "col") %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

###############################################################################

## poorly conserved (grades 1-3) variants that score below the lower_124
## threshold, binned by how far the score has dropped
consurf_lowcons_lowsec <- consurf %>%
  filter(color %in% as.character(1:3),
         average_score2 < pull(syn_threshold_wide, lower_124)) %>%
  ## < 0.1 severe, 0.1-0.5 moderate, > 0.5 mild
  mutate(class = case_when(average_score2 < 0.1 ~ "severe",
                           between(average_score2, 0.1, 0.5) ~ "moderate",
                           average_score2 > 0.5 ~ "mild"))

## number of variants in each severity bin, with a total
consurf_lowcons_lowsec_stat <- consurf_lowcons_lowsec %>%
  tabyl(class) %>%
  adorn_totals()

## per-position count of "severe" variants at poorly conserved positions
consurf_lowcons_severe <- consurf_lowcons_lowsec %>%
  filter(class == "severe") %>%
  group_by(pos) %>%
  count()

## per-position count of highly conserved (grades 7-9) variants that score
## above the lower_124 threshold
consurf_high <- consurf %>%
  filter(color %in% as.character(7:9),
         average_score2 > pull(syn_threshold_wide, lower_124)) %>%
  group_by(pos, seq) %>%
  count()

###############################################################################
  
## Fig. S8a - consurf plot by variant
## score of every variant, grouped by the ConSurf grade of its position
consurf_plot <- consurf %>%
  ## keep only positions whose grade was not flagged
  filter(lowconf != "fail") %>%
  ggplot(aes(x = color,
             y = average_score2)) + 
  ## dashed line at the antibody-124 lower synonymous threshold
  geom_hline(data = filter(syn_threshold, antibody == "124"),
             mapping = aes(yintercept = lower),
             linetype = "dashed") +
  ## individual variants
  geom_jitter(color = "steelblue", size = 0.5, alpha = 0.1, width = 0.3) +
  ## distribution outline per grade
  geom_violin(color = "black", fill = NA, linewidth = 0.3,
              scale = "width", adjust = 2) +
  ## box summary without outlier points
  geom_boxplot(outliers = FALSE, width = 0.2, linewidth = 0.3,
               color = "black", fill = NA) +
  ## y axis from 0 to 1.25 with plain-text tick labels
  scale_y_continuous(limits = c(-0.02, 1.27),
                     expand = c(0, 0),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Light chain secretion score",
       x = "ConSurf grade")

## write Fig. S8a to the supplementary panel folder
ggsave(filename = here("outputs", "supp_fig_panels", "S8a_consurf_variant.pdf"),
       plot = consurf_plot,
       device = cairo_pdf,
       width = 88, height = 44, units = "mm")

###############################################################################

## per-position median score, carrying the position's grade and residue
consurf_median <- consurf %>%
  ## keep only positions whose grade was not flagged
  filter(lowconf != "fail") %>%
  group_by(pos) %>%
  summarise(med_effect = median(average_score2, na.rm = TRUE),
            color = unique(color),
            seq = unique(seq)) %>%
  ungroup()

## highly conserved positions (grades 7-9) whose median score is above the
## antibody-124 lower synonymous threshold
consurf_median_high <- consurf_median %>%
  filter(color %in% as.character(7:9),
         med_effect > pull(filter(syn_threshold, antibody == "124"), lower))

###############################################################################

## Fig. S8b - consurf by median effect on position
## per-position median score, grouped by ConSurf grade
consurf_median_plot <- ggplot(data = consurf_median,
                              mapping = aes(x = color,
                                            y = med_effect)) + 
  ## dashed line at the antibody-124 lower synonymous threshold
  geom_hline(data = filter(syn_threshold, antibody == "124"),
             mapping = aes(yintercept = lower),
             linetype = "dashed") +
  ## individual positions
  geom_jitter(color = "sienna3", size = 0.5, alpha = 0.7, width = 0.3) +
  ## distribution outline per grade
  geom_violin(color = "black", fill = NA, linewidth = 0.3,
              scale = "width", adjust = 2) +
  ## box summary without outlier points
  geom_boxplot(outliers = FALSE, width = 0.2, linewidth = 0.3,
               color = "black", fill = NA) +
  ## y axis from 0 to 1.25 with plain-text tick labels
  scale_y_continuous(limits = c(-0.02, 1.27),
                     expand = c(0, 0),
                     breaks = seq(0, 1.25, by = 0.25),
                     labels = as.character) +
  ## axis titles
  labs(y = "Median light chain secretion score",
       x = "ConSurf grade")

## write Fig. S8b to the supplementary panel folder
ggsave(filename = here("outputs", "supp_fig_panels", "S8b_consurf_median.pdf"),
       plot = consurf_median_plot,
       device = cairo_pdf,
       width = 88, height = 44, units = "mm")

```

```{r VEPs}

## AlphaMissense export for P00740
alpha_missense <- here("inputs", "VEPs", "AlphaMissense-Search-P00740.tsv") %>%
  read_tsv() %>%
  clean_names() %>%
  ## shorter column names shared with the other predictor tables
  rename(wt_aa = a_a_1,
         var_aa = a_a_2,
         am_path_score = pathogenicity_score,
         am_path_class = pathogenicity_class) %>%
  ## variant name as <wt_aa><position><var_aa>
  mutate(variant = paste0(wt_aa, position, var_aa))

###############################################################################

## genomic sequence of F9: every line of the file is read into one
## "sequence" column and the lines are glued into a single string
gen <- read_csv(here("inputs", "coordinates", "genome_sequence_f9.csv"),
                col_names = "sequence") %>%
  ## one seq<i> column per input line, then paste them together in order
  mutate(seq_nm = paste0("seq", row_number())) %>%
  pivot_wider(values_from = sequence, names_from = seq_nm) %>%
  unite(col = "seq", everything(), sep = "") %>%
  pull()

## one row per nucleotide, numbered consecutively from 139530765
gen_num <- tibble(
  nt = unlist(str_extract_all(gen, boundary("character"))),
  gen_position = seq(from = 139530765, to = 139530764 + nchar(gen), by = 1)
)

## keep only exons (the upper-case nucleotides)
exon_nums <- gen_num %>%
  filter(grepl("[[:upper:]]", nt)) %>%
  ## every run of three retained nucleotides shares one protein position
  mutate(position = ceiling(row_number() / 3))

## REVEL scores, collapsed to one row per amino-acid substitution
revel <- here("inputs", "VEPs", "F9_revel.csv") %>%
  read_csv() %>%
  ## attach the protein position by matching genomic coordinate and
  ## reference nucleotide against the exon table
  left_join(exon_nums, by = c("grch38_pos" = "gen_position",
                              "ref" = "nt")) %>%
  ## shorter column names shared with the other predictor tables
  rename(wt_aa = aaref,
         var_aa = aaalt) %>%
  ## several SNVs can give the same substitution: average their scores
  group_by(wt_aa, position, var_aa) %>%
  mutate(revel_score = mean(REVEL, na.rm = TRUE)) %>%
  ungroup() %>%
  ## one row per substitution
  select(wt_aa, position, var_aa, revel_score) %>%
  distinct() %>%
  ## classify around 0.5 and build <wt_aa><position><var_aa>
  mutate(variant = paste0(wt_aa, position, var_aa),
         revel_class = case_when(revel_score > 0.5 ~ "likely_pathogenic",
                                 revel_score == 0.5 ~ "ambiguous",
                                 revel_score < 0.5 ~ "likely_benign")) %>%
  ## keep the original column order (class before variant)
  select(wt_aa, position, var_aa, revel_score, revel_class, variant)

###############################################################################

## EVE: keep the score and the class at the 75%-retained cutoff
eve <- here("inputs", "VEPs", "EVE_FA9_HUMAN.csv") %>%
  read_csv() %>%
  ## variant name as <wt_aa><position><var_aa>
  mutate(variant = paste0(wt_aa, position, var_aa)) %>%
  select(variant, EVE_scores_ASM, EVE_classes_75_pct_retained_ASM)

###############################################################################

## cadd (1.5)
cadd <- here("inputs", "VEPs", "F9_cadd.csv") %>%
  read_csv() %>%
  ## score column named like the other predictors' score columns
  rename(cadd_score = CADD)

###############################################################################

## all predictors in one table keyed by variant; full joins keep a variant
## that any predictor scored, then the predictions table is attached
VEPs_all <- eve %>%
  ## REVEL score and class
  full_join(select(revel, variant, revel_score, revel_class),
            by = "variant") %>%
  ## AlphaMissense score and class
  full_join(select(alpha_missense, variant, contains("am_path")),
            by = "variant") %>%
  ## CADD
  full_join(cadd, by = "variant") %>%
  ## new_predictions columns
  left_join(new_predictions, by = "variant")

###############################################################################
  
## absolute correlation between each predictor's score and each antibody's
## functional score
VEP_corr <- VEPs_all %>%
  ## stack the four predictor score columns
  pivot_longer(cols = c("EVE_scores_ASM", "revel_score", "am_path_score", "cadd_score"),
               names_to = "VEP",
               values_to = "VEP_score") %>%
  ## stack the per-antibody columns (names starting with "ab")
  pivot_longer(cols = starts_with("ab"),
               names_to = "antibody", 
               values_to = "functional_score") %>%
  group_by(antibody, VEP) %>%
  ## Pearson is cor()'s default method, spelled out here; the sign is dropped
  summarise(correlation = abs(cor(functional_score, VEP_score,
                                  method = "pearson",
                                  use = "pairwise.complete.obs")),
            .groups = "drop")

## Fig. S13a - VEP correlation plot
## dodged horizontal bars: one bar per predictor within each antibody
VEP_corr_plot <- VEP_corr %>%
  ## better antibody labels: match the "ab"-prefixed names used in VEP_corr
  ## and append the kind of score to each label
  left_join(antibody_table %>%
               mutate(antibody = paste0("ab", antibody),
                      antibody_label2 = gsub("antibody", "", antibody_label2),
                      antibody_label2 = case_when(grepl("Carboxy", antibody_label2) ~ paste0(antibody_label2, "\ncarboxylation score"),
                                                  TRUE ~ paste0(antibody_label2, "\nsecretion score")),
                      antibody_label2 = gsub("Carboxylation-sensitive\n", "", antibody_label2)),
            by = "antibody") %>%
  ## change VEP labels
  mutate(VEP = case_when(VEP == "am_path_score" ~ "AlphaMissense",
                         VEP == "cadd_score" ~ "CADD",
                         VEP == "EVE_scores_ASM" ~ "EVE",
                         VEP == "revel_score" ~ "REVEL"),
         VEP = factor(VEP, levels = c("EVE", "AlphaMissense", "REVEL", "CADD"))) %>%
  ## plot
  ggplot(aes(x = correlation,
             y = fct_rev(antibody_label2),
             fill = fct_rev(VEP))) + 
  geom_col(position = "dodge", color = "white", linewidth = 0.1) +
  scale_x_continuous(expand = c(0, 0),
                     limits = c(0, 0.71),
                     breaks = seq(0, 0.7, by = 0.1),
                     labels = function(x) as.character(x)) +
  scale_fill_manual(values = paletteer_d("PNWColors::Moth")[c(4, 3, 2, 1)]) +
  ## VEP_corr is computed with cor()'s default method, which is Pearson,
  ## so the axis is titled accordingly
  labs(x = "Pearson correlation",
       y = "MultiSTEP functional score")  +
  ## legend order is reversed to undo the fct_rev() used for the bar order
  guides(fill = guide_legend(reverse = TRUE, nrow = 2)) +
  theme(legend.title = element_blank(),
        legend.background = element_rect(fill = NA),
        legend.location = "plot",
        legend.key.size = unit(4, "mm"),
        legend.position = "bottom")

## save Fig. S13a - correlation plot
ggsave(here("outputs", "supp_fig_panels", "S13a_VEP_correlation.pdf"),
       plot = VEP_corr_plot, device = cairo_pdf,
       height = 71, width = 50, units = "mm")

###############################################################################

## threshold table
## one row per predictor: the low and high score cutoffs drawn as dashed lines
## in Fig. S13b (REVEL has no high cutoff)
threshold_table <- tribble(
  ~predictor,      ~low_threshold, ~high_threshold,
  "AlphaMissense", 0.34,           0.564,
  "EVE",           0.359,          0.641,
  "CADD",          10,             20,
  "REVEL",         0.5,            NA
)

## make limits command
## Returns the x-axis scale for one predictor's score histogram: CADD gets a
## 0-40 axis, every other predictor a 0-1 axis.
## This is a function of a single predictor name because `if` needs a scalar
## condition, and `curated_plot_by_vep` (which this used to read at the top
## level) is only created further down and is a combined plot without a
## `predictor` element, so the top-level `if` could never be evaluated.
## NOTE(review): the Fig. S13b panels below set their x scale inline (with
## slightly wider 0-1 limits) and do not use this helper - confirm whether it
## is still needed.
##
## predictor: a single predictor name (character or factor), e.g. "CADD"
## returns:   a ggplot2 continuous x scale
xlims <- function(predictor) {
  if (identical(as.character(predictor), "CADD")) {
    scale_x_continuous(expand = c(0, 0),
                       limits = c(-1, 41),
                       breaks = seq(0, 40, by = 10),
                       labels = function(x) as.character(x))
  } else {
    scale_x_continuous(expand = c(0, 0),
                       limits = c(-0.025, 1.025),
                       breaks = seq(0, 1, by = 0.25),
                       labels = function(x) as.character(x))
  }
}

## Fig. S13b - per VEP plot for separation of benign and pathogenic
## Builds one score histogram per predictor (filled by curated classification,
## with the predictor's thresholds as dashed lines) and lines the panels up in
## a single row with a shared legend.
curated_plot_by_vep <- curated_clinical_variants %>%
  ## one row per curated variant
  select(variant, pathogenicity) %>%
  distinct() %>%
  ## attach every VEP score column
  left_join(VEPs_all %>%
              select(variant, contains("score")),
            by = "variant") %>%
  ## long format; the predictor name is the text before the first underscore
  pivot_longer(cols = contains("score"),
               names_to = "predictor",
               names_pattern = "(.*?)_.*",
               values_to = "score") %>%
  ## display names for predictors and classifications
  mutate(predictor = if_else(predictor == "am",
                             "AlphaMissense",
                             toupper(predictor)),
         predictor = factor(predictor,
                            levels = c("AlphaMissense", "CADD", "EVE", "REVEL")),
         pathogenicity = case_when(pathogenicity == "Pathogenic" ~ "Pathogenic (P/LP)",
                                   pathogenicity == "Benign" ~ "Benign (B/LB)")) %>%
  ## drop unscored variants, then build the per-predictor "n = ..." label
  filter(!is.na(score)) %>%
  group_by(predictor) %>%
  mutate(n = paste0("n = ", n())) %>%
  ungroup() %>%
  ## join with thresholds
  left_join(threshold_table, by = "predictor") %>%
  ## one nested data frame per predictor
  group_by(predictor) %>%
  nest() %>%
  ## draw one panel per predictor; CADD uses a 0-40 x-axis, the rest 0-1
  mutate(plot = map2(data, predictor, function(panel_data, panel_name) {
    on_unit_axis <- panel_name != "CADD"
    ggplot(data = panel_data,
           aes(x = score,
               fill = pathogenicity)) +
      ## histogram of scores
      geom_histogram(bins = 20, color = "black", linewidth = 0.3) +
      ## thresholds
      geom_vline(aes(xintercept = unique(low_threshold)),
                 linetype = "dashed") +
      geom_vline(aes(xintercept = unique(high_threshold)),
                 linetype = "dashed") +
      ## variant count in the top-left corner
      annotate(geom = "text",
               x = if (on_unit_axis) c(0.1) else c(4),
               y = 58,
               label = unique(panel_data$n),
               size = 6 / .pt) +
      ## adjust x and y axes
      scale_x_continuous(expand = c(0, 0),
                         limits = if (on_unit_axis) c(-0.05, 1.05) else c(-1, 41),
                         breaks = if (on_unit_axis) seq(0, 1, by = 0.25) else seq(0, 40, by = 10),
                         labels = function(x) as.character(x)) +
      scale_y_continuous(expand = c(0, 0),
                         limits = c(-0.5, 60.5),
                         breaks = seq(0, 60, by = 10),
                         labels = function(x) as.character(x)) +
      ## adjust fill colors
      scale_fill_manual(values = c("#2166acFF","#b2182bFF")) +
      ## add titles
      labs(x = paste(panel_name, "score"),
           y = "Number of variants",
           title = panel_name)
  })) %>%
  arrange(predictor) %>%
  ## keep only the list of panels
  pull(plot) %>%
  ## combine panels; `&` applies the theme to every panel
  wrap_plots(guides = "collect", nrow = 1) &
  theme(legend.title = element_blank(),
        legend.key.size = unit(3, "mm"))

## save fig. S13b
ggsave(here("outputs", "supp_fig_panels", "S13b_clinical_vars_by_vep.pdf"),
       plot = curated_plot_by_vep, device = cairo_pdf,
       width = 207, height = 45, units = "mm")
    
```

```{r compare performance of VEPs to MultiSTEP RF}

## isolate only test set variants from RF model
## Result: one row per (variant, predictor) holding the curated class
## (`correct_class`, as a display label), the predictor's call
## (`predicted_class`, lower case) and a plotting category (`result`).
## NOTE(review): only `variant` and `pathogenicity` are kept from
## clinical_test, so ".pred_class" is presumably one of the "class" columns
## of VEPs_all - TODO confirm.
VEPs_test_set <- clinical_test %>%
  select(variant, pathogenicity) %>%
  left_join(VEPs_all %>%
              select(variant, contains("score"), contains("class")),
            by = "variant") %>%
  ## create classification for CADD, based on cutoff of 20 as used in MLOF
  ## CADD = 10-20 used for further investigation (ambiguous) in MLOF
  ## a score of exactly 20 counts as likely pathogenic, so no scored variant
  ## falls between the bins and ends up as "missing"
  mutate(cadd_class = case_when(cadd_score <= 10 ~ "likely benign",
                                cadd_score > 10 & cadd_score < 20 ~ "ambiguous",
                                cadd_score >= 20 ~ "likely pathogenic")) %>%
  rename("EVE" = "EVE_classes_75_pct_retained_ASM",
         "AlphaMissense" = "am_path_class",
         "CADD" = "cadd_class",
         "REVEL" = "revel_class",
         "MultiSTEP" = ".pred_class",
         "correct_class" = "pathogenicity") %>%
  select(-contains("score")) %>%
  pivot_longer(cols = c("EVE", "AlphaMissense", "CADD", "REVEL", "MultiSTEP"),
               names_to = "predictor",
               values_to = "predicted_class") %>%
  ## variants a predictor did not classify
  replace_na(list(predicted_class = "missing")) %>%
  mutate(predicted_class = gsub("Uncertain", "ambiguous", predicted_class),
         ## strip the "likely" prefix together with its separator; written so
         ## the result does not depend on how the regex engine orders
         ## alternatives
         predicted_class = gsub("likely[ _]?", "", predicted_class),
         ## lower-cases correct_class and predicted_class
         across(contains("class"), ~tolower(.x)),
         ## plotting category
         ## NOTE(review): every ambiguous call is labelled as a pathogenic
         ## variant with uncertain prediction, whatever its curated class -
         ## confirm this is intended
         result = case_when(
           correct_class == "pathogenic" &
             predicted_class == "pathogenic" ~
             "Pathogenic (P/LP) variant\nwith concordant prediction",
           correct_class == "pathogenic" &
             predicted_class == "benign" ~
             "Pathogenic (P/LP) variant\nwith discordant prediction",
           correct_class == "benign" &
             predicted_class == "benign" ~
             "Benign (B/LB) variant\nwith concordant prediction",
           correct_class == "benign" &
             predicted_class == "pathogenic" ~
             "Benign (B/LB) variant\nwith discordant prediction",
           predicted_class == "ambiguous" ~ "Pathogenic (P/LP) variant\n with uncertain prediction",
           predicted_class == "missing" ~ "Not predicted"),
         ## level order sets the stacking order of the bars
         result = factor(result,
                         levels = c("Not predicted",
                                    "Pathogenic (P/LP) variant\n with uncertain prediction",
                                    "Benign (B/LB) variant\nwith discordant prediction",
                                    "Pathogenic (P/LP) variant\nwith discordant prediction",
                                    "Benign (B/LB) variant\nwith concordant prediction",
                                    "Pathogenic (P/LP) variant\nwith concordant prediction")),
         predictor = factor(predictor, levels = c("MultiSTEP",
                                                  "AlphaMissense", "EVE",
                                                  "CADD", "REVEL")),
         ## from here on correct_class holds the axis display label, not the
         ## lower-case class
         correct_class = case_when(correct_class == "pathogenic" ~ "Pathogenic\n(P/LP)",
                                   correct_class == "benign" ~ "Benign\n(B/LB)"))

## collect metrics for each predictor
## Builds `truth` / `estimate` factors (levels: pathogenic, benign) for every
## test-set row and passes them, grouped by predictor, to
## collect_performance1().
VEP_metrics <- VEPs_test_set %>%
  ## VEPs_test_set stores correct_class as a display label
  ## ("Pathogenic\n(P/LP)" / "Benign\n(B/LB)"), so the lower-case class is
  ## recovered from the label prefix; comparing the label directly against
  ## "pathogenic" / "benign" would match nothing and leave truth all NA
  mutate(truth = case_when(grepl("^pathogenic", correct_class, ignore.case = TRUE) ~ "pathogenic",
                           grepl("^benign", correct_class, ignore.case = TRUE) ~ "benign"),
         ## ambiguous or missing are changed to "opposite" class of the true
         ## class for performance calculations since they are still wrong
         ## relative to the true class
         estimate = case_when(predicted_class %in% c("missing", "ambiguous") &
                                truth == "benign" ~ "pathogenic",
                              predicted_class %in% c("missing", "ambiguous") &
                                truth == "pathogenic" ~ "benign",
                              TRUE ~ predicted_class),
         ## make truth and estimate factors
         truth = factor(truth, levels = c("pathogenic", "benign")),
         estimate = factor(estimate, levels = c("pathogenic", "benign"))) %>%
  group_by(predictor) %>%
  collect_performance1(truth = truth,
                       estimate = estimate)

## Fig. S13c - plot test set variants with threshold
## Stacked, patterned bar chart of test-set variants by curated class, one
## facet per predictor, with that predictor's metrics printed in a label.
VEPs_test_plot <- VEPs_test_set %>%
  ## plot
  ggplot(aes(x = correct_class,
             fill = result,
             pattern_fill = result)) +
  ## patterned, stacked counts
  geom_bar_pattern(position = "stack",
                   color = "white", linewidth = 0.5, alpha = 0.8,
                   pattern_color = NA,
                   pattern_density = 0.4, pattern_spacing = 0.05) +
  ## one multi-line metrics label per predictor, anchored at the top left of
  ## its facet
  geom_label(data = VEP_metrics %>%
               mutate(.metric = case_when(.metric %in% c("ppv", "npv") ~ toupper(.metric),
                                          .metric == "sens" ~ "Sens",
                                          .metric == "spec" ~ "Spec"),
                      label1 = paste0(.metric, ": ", round(.estimate, digits = 2))) %>%
               select(-.estimate) %>%
               ## one column of text per metric
               pivot_wider(names_from = .metric,
                           values_from = label1) %>%
               mutate(label = paste(PPV, NPV, Spec, Sens, sep = "\n")),
             aes(x = 0.47,
                 y = 34,
                 label = label),
             inherit.aes = FALSE,
             hjust = 0, vjust = 1,
             color = "black", size = 6 / .pt) +
  ## facet over each predictor
  facet_grid(cols = vars(predictor), space = "free") +
  ## y axis
  scale_y_continuous(limits = c(0, 35),
                     breaks = seq(0, 35, by = 5),
                     labels = function(x) as.character(x),
                     expand = c(0, 0)) +
  ## fill: blue for benign, red for pathogenic, gold for uncertain, grey for
  ## not predicted
  scale_fill_manual(values = c( "#2166acFF", "#2166acFF","#b2182bFF",
                                "#b2182bFF", "goldenrod", "grey50"),
                    breaks = c("Benign (B/LB) variant\nwith concordant prediction",
                               "Benign (B/LB) variant\nwith discordant prediction",
                               "Pathogenic (P/LP) variant\nwith concordant prediction",
                               "Pathogenic (P/LP) variant\nwith discordant prediction",
                               "Pathogenic (P/LP) variant\n with uncertain prediction",
                               "Not predicted")) +
  ## pattern fill: same breaks as the fill scale so both share one legend
  scale_pattern_fill_manual(values = c(NA, "black", NA, "black", "black", "black"),
                            breaks = c("Benign (B/LB) variant\nwith concordant prediction",
                                       "Benign (B/LB) variant\nwith discordant prediction",
                                       "Pathogenic (P/LP) variant\nwith concordant prediction",
                                       "Pathogenic (P/LP) variant\nwith discordant prediction",
                                       "Pathogenic (P/LP) variant\n with uncertain prediction",
                                       "Not predicted")) +
  ## adjust legend
  guides(pattern_fill = guide_legend(override.aes = list(pattern_spacing = 0.01),
                                     color = "black", nrow = 2)) +
  ## add titles
  labs(x = "\nCurated variant classification",
       y = "Number of variants") +
  ## adjust themes
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        legend.key.size = unit(4, "mm"),
        panel.spacing.x = unit(0, "line"),
        strip.placement = "outside")

## save Fig. S13c - accuracy plot
ggsave(here("outputs", "supp_fig_panels", "S13c_VEP_accuracy.pdf"),
       plot = VEPs_test_plot, device = cairo_pdf,
       width = 130, height = 80, units = "mm")

```