#' ---
#' title: "Preprocessing of Top-Down Data for 103 Human Subjects"
#' output:
#'   BiocStyle::html_document:
#'     toc: true
#'     number_sections: true
#'     highlight: haddock
#' ---
#+ echo=F
knitr::opts_chunk$set(echo=F, message=F, warning=F, fig.align='center')
#, out.width='10cm')
#+ libraries
library(tidyverse)
library(TopPICR) # remotes::install_github("evanamartin/TopPICR", ref="0.0.3")
library(memoise)
library(MSnbase)
library(MSnSet.utils)
library(PNNL.DMS.utils)
# Create the output directory, just in case it doesn't exist yet.
if(!dir.exists("output_data"))
  dir.create("output_data")
# # memoisation of the slow steps
# augment_annotation <- memoise(augment_annotation, cache = cache_filesystem("./caching"))
# form_model <- memoise(form_model, cache = cache_filesystem("./caching"))
# align_rt <- memoise(align_rt, cache = cache_filesystem("./caching"))
# calc_error <- memoise(calc_error, cache = cache_filesystem("./caching"))
# cluster <- memoise(cluster, cache = cache_filesystem("./caching"))
# create_pcg <- memoise(create_pcg, cache = cache_filesystem("./caching"))
# create_mdata <- memoise(create_mdata, cache = cache_filesystem("./caching"))
if("toppic_output.RData" %in% list.files(path = "./source_data/")){
load("./source_data/toppic_output.RData")
}else{
#data set with max mod = 4000 Da
toppic_output <- read_TopPIC_DMS(data_package_num = 5149)
save(toppic_output, file = "./source_data/toppic_output.RData")
}
ids <- toppic_output$ms2identifications
feat <- toppic_output$ms1features
# Remove NA annotations; we can't handle them at the FDR filter tuning step.
# These are non-UniProt IDs, such as short ORFs and contaminants.
ids <- ids %>%
  dplyr::filter(!is.na(AnnType))
# Add CVs (compensation voltages)
# Note: CVs are encoded in the dataset names by a consecutive numeric suffix.
# I believe there is no table converting that suffix to a CV; here the mapping
# was sorted out from the mass ranges, and it was consistent across the
# datasets. In the future, we need a non-manual approach for mapping the
# suffixes to CVs.
cv_codes <- c("0" = "-50", "1" = "-40", "2" = "-30")
ids <- ids %>%
  mutate(CV = as.character(cv_codes[sub(".*(\\d)$", "\\1", Dataset)])) %>%
  mutate(Dataset = sub("(.*)_\\d$", "\\1", Dataset))
feat <- feat %>%
  mutate(CV = as.character(cv_codes[sub(".*(\\d)$", "\\1", Dataset)])) %>%
  mutate(Dataset = sub("(.*)_\\d$", "\\1", Dataset))
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
# Identified feature steps
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
# Remove erroneous genes ---------------
# In case a proteoform is assigned to multiple genes, this function keeps the
# gene with the lowest E-value.
x_err <- rm_false_gene(ids) # TODO: rename !!!
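# Optional check (sketch): after rm_false_gene() each sequence should map to a
# single gene. `cleanSeq` is the sequence key used further below; the `Gene`
# column name is assumed from the gene-level FDR wording in this script.
x_err %>%
  group_by(cleanSeq) %>%
  summarise(n_genes = n_distinct(Gene), .groups = "drop") %>%
  filter(n_genes > 1) # expected to be empty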
# Get protLength (the parent protein length).
# No need to match; just link accessions:
# 1. read the FASTA, and
# 2. join by `Protein accession` / UniProtAcc.
if(!file.exists("./source_data/ID_008032_8627C6BD.fasta")){
  file.copy(file.path(r"(\\gigasax\DMS_FASTA_File_Archive\Dynamic\Forward\)",
                      "ID_008032_8627C6BD.fasta"), "./source_data/")
}
fst <- Biostrings::readAAStringSet("./source_data/ID_008032_8627C6BD.fasta")
# Append the parent protein length (protLength) by joining on
# `Protein accession` parsed from the FASTA headers.
add_protein_length <- function(x, fst_obj){
  fst_df <- data.frame(`Protein accession` = names(fst_obj),
                       protLength = BiocGenerics::width(fst_obj),
                       row.names = NULL,
                       check.names = FALSE) %>%
    # FASTA headers carry a description after the accession; keep the first word.
    mutate(`Protein accession` = word(`Protein accession`))
  x <- inner_join(x, fst_df, by = "Protein accession")
  return(x)
}
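# Optional check (sketch): inner_join() inside add_protein_length() silently
# drops rows whose accession is absent from the FASTA. Before joining, gauge
# the coverage (word() extracts the accession, mirroring the function above):
mean(x_err$`Protein accession` %in% word(names(fst))) # fraction matched; ~1 expected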
x_err <- add_protein_length(x_err, fst)
compute_fdr(x_err)
# Filter by cleanSeq count ---------------
# A really minimal filter.
x_fbc <- filter_by_count(x = x_err,
                         count_within = c("Dataset", "Scan(s)"),
                         count = "cleanSeq",
                         threshold = 2) # Better to switch to 3 and drop the E-value filter altogether.
compute_fdr(x_fbc)
# FDR control ---------------
# First find the E-value threshold for each of the three annotation types.
# FDR is calculated at the Gene level.
the_cutoff <- find_evalue_cutoff(x = x_fbc,
                                 fdr_threshold = 0.01)
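# Optional look (sketch): the_cutoff is expected to hold one E-value threshold
# per annotation type.
print(the_cutoff)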
# Apply the actual FDR control/filter with the threshold values from above.
x_fdr <- apply_evalue_cutoff(x = x_fbc,
                             e_vals = the_cutoff)
compute_fdr(x_fdr)
# Proteoform inference ---------------
x_ipf <- infer_prot(x_fdr)
compute_fdr(x_ipf)
# I think this concludes the polishing of the MS/MS identifications.
# Align retention time ---------------
# Create the model that will be used to align retention times. A model is fit
# between a reference data set and each of the other data sets.
the_model <- form_model(
  x = x_ipf,
  ref_ds = find_ref_ds(x = x_ipf),
  control = loess.control(surface = "direct"), # Use direct to avoid NAs.
  span = 0.5,
  family = "symmetric")
# Align retention times according to the model created above.
x_art <- align_rt(
  x = x_ipf,
  model = the_model,
  var_name = "Feature apex")
# Recalibrate the mass ---------------
# Calculate the error between the `Precursor mass` and the `Adjusted precursor
# mass`. This acts as the model for recalibrating the mass. A reference data
# set is not used when calculating the mass error model.
the_error <- calc_error(x = x_art,
                        ref_ds = find_ref_ds(x = x_ipf))
# Recalibrate the mass according to the errors computed above.
x_rcm <- recalibrate_mass(x = x_art,
                          errors = the_error,
                          var_name = "Precursor mass")
# Cluster ---------------
# Key step, but it doesn't take too long.
x_clu <- cluster(x = x_rcm,
                 errors = the_error,
                 method = "single",
                 height = 1.5,
                 min_size = 3)
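# Optional summary (sketch): rough count of proteoform clusters. The `cluster`
# column name is an assumption inferred from the Gene_cluster wording further
# below; adjust if TopPICR names it differently.
# dplyr::n_distinct(x_clu$cluster)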
# Group clusters ---------------
x_grp <- create_pcg(x = x_clu,
                    errors = the_error,
                    n_mme_sd = 4/the_error$ppm_sd, # 4 ppm in SD units of the mass error (~5.4 SDs)
                    # ppm_cutoff = 4,
                    n_Da = 4,
                    n_rt_sd = 1.5)
# Create metadata ---------------
# Summary at the Gene/pcGroup combination level.
# NOTE: The number of distinct Gene_pcGroups can differ between the x_meta
# object and the fused object. This happens because some Gene_clusters present
# in the identified data do not exist in the unidentified data.
# The object created here is not used in this script. It will become part of
# the MSnSet created in another script, and it is not needed by any of the
# functions that manipulate the feature data.
x_meta <- create_mdata(x = x_grp,
                       errors = the_error,
                       # The commented-out absolute cutoffs, expressed in SD units:
                       n_mme_sd = 4/the_error$ppm_sd,
                       n_rt_sd = 150/the_error$rt_sd
                       # ppm_cutoff = 4,
                       # rt_cutoff = 150
                       )
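# Optional check (sketch), relating to the NOTE above: compare the distinct
# Gene/pcGroup combinations in x_grp and x_meta. The `pcGroup` column name is
# an assumption based on the Gene_pcGroup wording in the comments.
# n_distinct(paste(x_grp$Gene, x_grp$pcGroup))
# n_distinct(paste(x_meta$Gene, x_meta$pcGroup))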
# Maybe this is where the first break is supposed to be. Then we'll split the
# pipeline into two branches: intensity-based and spectral count-based.
# What do I need to save/retain?
# SPC: x_grp, x_meta
# INT: feat, the_model, the_error
save(x_grp, x_meta, file = "./output_data/pre_identifications.RData")
save(feat, the_model, the_error, file = "./output_data/pre_features.RData")
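# Downstream scripts can restore these objects with:
# load("./output_data/pre_identifications.RData") # x_grp, x_meta
# load("./output_data/pre_features.RData")        # feat, the_model, the_error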