-
Notifications
You must be signed in to change notification settings - Fork 0
/
biblio.bib
547 lines (512 loc) · 29.4 KB
/
biblio.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
% 32321741
% PRIDE database resources: 2022 update paper.
@article{pride2022,
  author   = {Perez-Riverol, Yasset and Bai, Jingwen and Bandla, Chakradhar and Garc{\'\i}a-Seisdedos, David and Hewapathirana, Suresh and Kamatchinathan, Selvakumar and Kundu, Deepti J and Prakash, Ananth and Frericks-Zipper, Anika and Eisenacher, Martin and Walzer, Mathias and Wang, Shengbo and Brazma, Alvis and Vizca{\'\i}no, Juan Antonio},
  title    = {The {PRIDE} database resources in 2022: a hub for mass spectrometry-based proteomics evidences},
  journal  = {Nucleic Acids Research},
  volume   = {50},
  number   = {D1},
  pages    = {D543--D552},
  year     = {2021},
  month    = nov,
  abstract = {The PRoteomics IDEntifications (PRIDE) database (https://www.ebi.ac.uk/pride/) is the world's largest data repository of mass spectrometry-based proteomics data. PRIDE is one of the founding members of the global ProteomeXchange (PX) consortium and an ELIXIR core data resource. In this manuscript, we summarize the developments in PRIDE resources and related tools since the previous update manuscript was published in Nucleic Acids Research in 2019. The number of submitted datasets to PRIDE Archive (the archival component of PRIDE) has reached on average around 500 datasets per month during 2021. In addition to continuous improvements in PRIDE Archive data pipelines and infrastructure, the PRIDE Spectra Archive has been developed to provide direct access to the submitted mass spectra using Universal Spectrum Identifiers. As a key point, the file format MAGE-TAB for proteomics has been developed to enable the improvement of sample metadata annotation. Additionally, the resource PRIDE Peptidome provides access to aggregated peptide/protein evidences across PRIDE Archive. Furthermore, we will describe how PRIDE has increased its efforts to reuse and disseminate high-quality proteomics data into other added-value resources such as UniProt, Ensembl and Expression Atlas.},
  issn     = {0305-1048},
  doi      = {10.1093/nar/gkab1038},
  url      = {https://doi.org/10.1093/nar/gkab1038},
}
% Prosit-based spectrum prediction applied to HLA / non-tryptic peptide identification.
@article{Wilhelm2021,
  author   = {Wilhelm, Mathias and Zolg, Daniel P and Graber, Michael
              and Gessulat, Siegfried and Schmidt, Tobias
              and Schnatbaum, Karsten and Schwencke-Westphal, Celina
              and Seifert, Philipp and de Andrade Kr{\"a}tzig, Niklas
              and Zerweck, Johannes and Knaute, Tobias
              and Br{\"a}unlein, Eva and Samaras, Patroklos
              and Lautenbacher, Ludwig and Klaeger, Susan
              and Wenschuh, Holger and Rad, Roland
              and Delanghe, Bernard and Huhmer, Andreas
              and Carr, Steven A and Clauser, Karl R
              and Krackhardt, Angela M and Reimer, Ulf
              and Kuster, Bernhard},
  title    = {Deep learning boosts sensitivity of mass spectrometry-based
              immunopeptidomics},
  journal  = {Nature Communications},
  volume   = {12},
  number   = {1},
  pages    = {3346},
  month    = jun,
  year     = {2021},
  language = {en},
  abstract = {Characterizing the human leukocyte antigen (HLA) bound ligandome
              by mass spectrometry (MS) holds great promise for developing
              vaccines and drugs for immune-oncology. Still, the identification
              of non-tryptic peptides presents substantial computational
              challenges. To address these, we synthesized and analyzed
              >300,000 peptides by multi-modal LC-MS/MS within the
              ProteomeTools project representing HLA class I \& II ligands and
              products of the proteases AspN and LysN. The resulting data
              enabled training of a single model using the deep learning
              framework Prosit, allowing the accurate prediction of fragment
              ion spectra for tryptic and non-tryptic peptides. Applying Prosit
              demonstrates that the identification of HLA peptides can be
              improved up to 7-fold, that 87\% of the proposed proteasomally
              spliced HLA peptides may be incorrect and that dozens of
              additional immunogenic neo-epitopes can be identified from
              patient tumors in published data. Together, the provided
              peptides, spectra and computational tools substantially expand
              the analytical depth of immunopeptidomics workflows.},
}
% LFQ benchmark dataset across DDA/DIA acquisition platforms (PXD028735).
@article{Van_Puyvelde2022,
  author   = {Van Puyvelde, Bart and Daled, Simon and Willems, Sander
              and Gabriels, Ralf and {Gonzalez de Peredo}, Anne
              and Chaoui, Karima and Mouton-Barbosa, Emmanuelle
              and Bouyssi{\'e}, David and Boonen, Kurt
              and Hughes, Christopher J and Gethings, Lee A
              and Perez-Riverol, Yasset and Bloomfield, Nic
              and Tate, Stephen and Schiltz, Odile
              and Martens, Lennart and Deforce, Dieter
              and Dhaenens, Maarten},
  title    = {A comprehensive {LFQ} benchmark dataset on modern day acquisition
              strategies in proteomics},
  journal  = {Scientific Data},
  volume   = {9},
  number   = {1},
  pages    = {126},
  month    = mar,
  year     = {2022},
  language = {en},
  abstract = {In the last decade, a revolution in liquid chromatography-mass
              spectrometry (LC-MS) based proteomics was unfolded with the
              introduction of dozens of novel instruments that incorporate
              additional data dimensions through innovative acquisition
              methodologies, in turn inspiring specialized data analysis
              pipelines. Simultaneously, a growing number of proteomics
              datasets have been made publicly available through data
              repositories such as ProteomeXchange, Zenodo and Skyline
              Panorama. However, developing algorithms to mine this data and
              assessing the performance on different platforms is currently
              hampered by the lack of a single benchmark experimental design.
              Therefore, we acquired a hybrid proteome mixture on different
              instrument platforms and in all currently available families of
              data acquisition. Here, we present a comprehensive Data-Dependent
              and Data-Independent Acquisition (DDA/DIA) dataset acquired using
              several of the most commonly used current day instrumental
              platforms. The dataset consists of over 700 LC-MS runs, including
              adequate replicates allowing robust statistics and covering over
              nearly 10 different data formats, including scanning quadrupole
              and ion mobility enabled acquisitions. Datasets are available via
              ProteomeXchange (PXD028735).},
}
% Pyrococcus furiosus proteome as a standard for evaluating identification workflows.
@article{Vaudel2012,
  author   = {Vaudel, Marc and Burkhart, Julia M and Breiter, Daniela
              and Zahedi, Ren{\'e} P and Sickmann, Albert
              and Martens, Lennart},
  title    = {A complex standard for protein identification, designed by
              evolution},
  journal  = {Journal of Proteome Research},
  volume   = {11},
  number   = {10},
  pages    = {5065--5071},
  month    = oct,
  year     = {2012},
  language = {en},
  abstract = {Shotgun proteomic investigations rely on the algorithmic
              assignment of mass spectra to peptides. The quality of these
              matches is therefore a cornerstone in the analysis and has been
              the subject of numerous recent developments. In order to
              establish the benefits of novel algorithms, they are applied to
              reference samples of known content. However, these were recently
              shown to be either too simple to resemble typical real-life
              samples or as leading to results of lower accuracy as the method
              itself. Here, we describe how to use the proteome of Pyrococcus
              furiosus, a hyperthermophile, as a standard to evaluate
              proteomics identification workflows. Indeed, we prove that the
              Pyrococcus furiosus proteome provides a valid method for
              detecting random hits, comparable to the decoy databases
              currently in popular use, but we also prove that the Pyrococcus
              furiosus proteome goes squarely beyond the decoy approach by also
              providing many hundreds of highly reliable true positive hits.
              Searching the Pyrococcus furiosus proteome can thus be used as a
              unique test that provides the ability to reliably detect both
              false positives as well as proteome-scale true positives,
              allowing the rigorous testing of identification algorithms at the
              peptide and protein level.},
}
% SearchGUI: common interface for proteomics search and de novo engines.
@article{Barsnes2018,
  author   = {Barsnes, Harald and Vaudel, Marc},
  title    = {{SearchGUI}: A Highly Adaptable Common Interface for Proteomics
              Search and de Novo Engines},
  journal  = {Journal of Proteome Research},
  volume   = {17},
  number   = {7},
  pages    = {2552--2555},
  month    = jul,
  year     = {2018},
  keywords = {bioinformatics; de novo algorithms; protein identification;
              search engines},
  language = {en},
  abstract = {Mass-spectrometry-based proteomics has become the standard
              approach for identifying and quantifying proteins. A vital step
              consists of analyzing experimentally generated mass spectra to
              identify the underlying peptide sequences for later mapping to
              the originating proteins. We here present the latest developments
              in SearchGUI, a common open-source interface for the most
              frequently used freely available proteomics search and de novo
              engines that has evolved into a central component in numerous
              bioinformatics workflows.},
}
% MS-GF+ database search tool.
@article{Kim2014,
  author   = {Kim, Sangtae and Pevzner, Pavel A},
  title    = {{MS-GF+} makes progress towards a universal database search tool
              for proteomics},
  journal  = {Nature Communications},
  volume   = {5},
  pages    = {5277},
  month    = oct,
  year     = {2014},
  language = {en},
  abstract = {Mass spectrometry (MS) instruments and experimental protocols are
              rapidly advancing, but the software tools to analyse tandem mass
              spectra are lagging behind. We present a database search tool
              MS-GF+ that is sensitive (it identifies more peptides than most
              other database search tools) and universal (it works well for
              diverse types of spectra, different configurations of MS
              instruments and different experimental protocols). We benchmark
              MS-GF+ using diverse spectral data sets: (i) spectra of varying
              fragmentation methods; (ii) spectra of multiple enzyme digests;
              (iii) spectra of phosphorylated peptides; and (iv) spectra of
              peptides with unusual fragmentation propensities produced by a
              novel alpha-lytic protease. For all these data sets, MS-GF+
              significantly increases the number of identified peptides
              compared with commonly used methods for peptide identifications.
              We emphasize that although MS-GF+ is not specifically designed
              for any particular experimental set-up, it improves on the
              performance of tools specifically designed for these applications
              (for example, specialized tools for phosphoproteomics).},
}
% Survival functions and expectation values for evaluating protein identification scores.
@article{Fenyo2003,
  author   = {Feny{\"o}, David and Beavis, Ronald C},
  title    = {A method for assessing the statistical significance of mass
              spectrometry-based protein identifications using general scoring
              schemes},
  journal  = {Analytical Chemistry},
  volume   = {75},
  number   = {4},
  pages    = {768--774},
  month    = feb,
  year     = {2003},
  language = {en},
  abstract = {This paper investigates the use of survival functions and
              expectation values to evaluate the results of protein
              identification experiments. These functions are standard
              statistical measures that can be used to reduce various protein
              identification scoring schemes to a common, easily interpretably
              representation. The relative merits of scoring systems were
              explored using this approach, as well as the effects of altering
              primary identification parameters. We would advocate the
              widespread use of these simple statistical measures to simplify
              and standardize the reporting of the confidence of protein
              identification results, allowing the users of different
              identification algorithms to compare their results in a
              straightforward and statistically significant manner. A method is
              described for measuring these distributions using information
              that is being discarded by most protein identification search
              engines, resulting in accurate survival functions that are
              specific to any combination of scoring algorithms, sequence
              databases, and mass spectra.},
}
% ThermoRawFileParser: converts Thermo RAW files to open formats (MGF, mzML).
@article{Hulstaert2020,
  author   = {Hulstaert, Niels and Shofstahl, Jim and Sachsenberg, Timo
              and Walzer, Mathias and Barsnes, Harald
              and Martens, Lennart and Perez-Riverol, Yasset},
  title    = {{ThermoRawFileParser}: Modular, Scalable, and {Cross-Platform}
              {RAW} File Conversion},
  journal  = {Journal of Proteome Research},
  volume   = {19},
  number   = {1},
  pages    = {537--542},
  month    = jan,
  year     = {2020},
  keywords = {big data; bioinformatics; cloud; file formats; mass spectrometry;
              metadata; mzML; open source; software; workflows},
  language = {en},
  abstract = {The field of computational proteomics is approaching the big data
              age, driven both by a continuous growth in the number of samples
              analyzed per experiment as well as by the growing amount of data
              obtained in each analytical run. In order to process these large
              amounts of data, it is increasingly necessary to use elastic
              compute resources such as Linux-based cluster environments and
              cloud infrastructures. Unfortunately, the vast majority of
              cross-platform proteomics tools are not able to operate directly
              on the proprietary formats generated by the diverse mass
              spectrometers. Here, we present ThermoRawFileParser, an
              open-source, cross-platform tool that converts Thermo RAW files
              into open file formats such as MGF and the HUPO-PSI standard file
              format mzML. To ensure the broadest possible availability and to
              increase integration capabilities with popular workflow systems
              such as Galaxy or Nextflow, we have also built Conda package and
              BioContainers container around ThermoRawFileParser. In addition,
              we implemented a user-friendly interface (ThermoRawFileParserGUI)
              for those users not familiar with command-line tools. Finally, we
              performed a benchmark of ThermoRawFileParser and msconvert to
              verify that the converted mzML files contain reliable
              quantitative results.},
}
% Nextflow: reproducible computational workflows.
@article{Di_Tommaso2017,
  author   = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W
              and Barja, Pablo Prieto and Palumbo, Emilio
              and Notredame, Cedric},
  title    = {Nextflow enables reproducible computational workflows},
  journal  = {Nature Biotechnology},
  volume   = {35},
  number   = {4},
  pages    = {316--319},
  month    = apr,
  year     = {2017},
  language = {en},
}
% BioContainers: container framework for bioinformatics software.
@article{Da_Veiga_Leprevost2017,
  author   = {da Veiga Leprevost, Felipe and Gr{\"u}ning, Bj{\"o}rn A
              and Alves Aflitos, Saulo and R{\"o}st, Hannes L
              and Uszkoreit, Julian and Barsnes, Harald
              and Vaudel, Marc and Moreno, Pablo
              and Gatto, Laurent and Weber, Jonas
              and Bai, Mingze and Jimenez, Rafael C
              and Sachsenberg, Timo and Pfeuffer, Julianus
              and Vera Alvarez, Roberto and Griss, Johannes
              and Nesvizhskii, Alexey I and Perez-Riverol, Yasset},
  title    = {{BioContainers}: an open-source and community-driven framework
              for software standardization},
  journal  = {Bioinformatics},
  volume   = {33},
  number   = {16},
  pages    = {2580--2582},
  month    = aug,
  year     = {2017},
  language = {en},
  abstract = {MOTIVATION: BioContainers (biocontainers.pro) is an open-source
              and community-driven framework which provides platform
              independent executable environments for bioinformatics software.
              BioContainers allows labs of all sizes to easily install
              bioinformatics software, maintain multiple versions of the same
              software and combine tools into powerful analysis pipelines.
              BioContainers is based on popular open-source projects Docker and
              rkt frameworks, that allow software to be installed and executed
              under an isolated and controlled environment. Also, it provides
              infrastructure and basic guidelines to create, manage and
              distribute bioinformatics containers with a special focus on
              omics technologies. These containers can be integrated into more
              comprehensive bioinformatics pipelines and different
              architectures (local desktop, cloud environments or HPC
              clusters). AVAILABILITY AND IMPLEMENTATION: The software is
              freely available at github.com/BioContainers/. CONTACT:
              yperez@ebi.ac.uk.},
}
% psm_utils: Python package for reading, writing and handling PSM file formats (ChemRxiv).
@article{Gabriels2022,
  author   = {Gabriels, Ralf and Declercq, Arthur and Bouwmeester, Robbin
              and Degroeve, Sven and Martens, Lennart},
  title    = {{psm\_utils}: A high level {Python} {API} for parsing and handling
              peptide-spectrum-matches and proteomics search results},
  journal  = {ChemRxiv},
  month    = oct,
  year     = {2022},
  keywords = {Proteomics;Bioinformatics;Data analysis;Peptide
              identification;Peptide-spectrum matches},
  language = {en},
  abstract = {A plethora of proteomics search engine output file formats are in
              circulation. This lack of standardized output files greatly
              complicates generic downstream processing of peptide-spectrum
              matches (PSMs) and PSM files. While standards exist to solve this
              problem, these are far from universally supported by search
              engines. Moreover, software libraries are available to read a
              selection of PSM file formats, but a light-weight package to
              parse PSM files into a unified data structure has been missing.
              Here, we present psm\_utils, a Python package to read and write
              various PSM file formats and to handle peptidoforms, PSMs, and
              PSM lists in a unified and user-friendly Python-, command line-,
              and web-interface. psm\_utils was developed with pragmatism and
              maintainability in mind, adhering to community standards and
              relying on existing packages where possible. The Python API and
              command line interface greatly facilitate handling various PSM
              file formats. Moreover, a user-friendly web application was built
              using psm\_utils that allows anyone to interconvert PSM files and
              retrieve basic PSM statistics. psm\_utils is freely available
              under the permissive Apache2 license at
              https://github.com/compomics/psm\_utils.},
}
% Review of sequence database search engines for mass spectrometry based proteomics.
@article{Verheggen2020,
  author   = {Verheggen, Kenneth and Raeder, Helge and Berven, Frode S
              and Martens, Lennart and Barsnes, Harald
              and Vaudel, Marc},
  title    = {Anatomy and evolution of database search engines---a central
              component of mass spectrometry based proteomic workflows},
  journal  = {Mass Spectrometry Reviews},
  volume   = {39},
  number   = {3},
  pages    = {292--306},
  month    = may,
  year     = {2020},
  keywords = {bioinformatics; proteomics; search engine},
  language = {en},
  abstract = {Sequence database search engines are bioinformatics algorithms
              that identify peptides from tandem mass spectra using a reference
              protein sequence database. Two decades of development, notably
              driven by advances in mass spectrometry, have provided scientists
              with more than 30 published search engines, each with its own
              properties. In this review, we present the common paradigm behind
              the different implementations, and its limitations for modern
              mass spectrometry datasets. We also detail how the search engines
              attempt to alleviate these limitations, and provide an overview
              of the different software frameworks available to the researcher.
              Finally, we highlight alternative approaches for the
              identification of proteomic mass spectrometry datasets, either as
              a replacement for, or as a complement to, sequence database
              search engines.},
}
% Target-decoy search strategy for estimating incorrect identifications.
@article{Elias2010,
  author   = {Elias, Joshua E and Gygi, Steven P},
  title    = {Target-decoy search strategy for mass spectrometry-based
              proteomics},
  journal  = {Methods in Molecular Biology},
  volume   = {604},
  pages    = {55--71},
  year     = {2010},
  language = {en},
  abstract = {Accurate and precise methods for estimating incorrect peptide and
              protein identifications are crucial for effective large-scale
              proteome analyses by tandem mass spectrometry. The target-decoy
              search strategy has emerged as a simple, effective tool for
              generating such estimations. This strategy is based on the
              premise that obvious, necessarily incorrect ``decoy'' sequences
              added to the search space will correspond with incorrect search
              results that might otherwise be deemed to be correct. With this
              knowledge, it is possible not only to estimate how many incorrect
              results are in a final data set but also to use decoy hits to
              guide the design of filtering criteria that sensitively partition
              a data set into correct and incorrect identifications.},
}
% Critical re-examination of the target-decoy approach for FDR estimation.
@article{Gupta2011,
  author   = {Gupta, Nitin and Bandeira, Nuno and Keich, Uri
              and Pevzner, Pavel A},
  title    = {Target-decoy approach and false discovery rate: when things may
              go wrong},
  journal  = {Journal of the American Society for Mass Spectrometry},
  volume   = {22},
  number   = {7},
  pages    = {1111--1120},
  month    = jul,
  year     = {2011},
  language = {en},
  abstract = {The target-decoy approach (TDA) has done the field of proteomics
              a great service by filling in the need to estimate the false
              discovery rates (FDR) of peptide identifications. While TDA is
              often viewed as a universal solution to the problem of FDR
              evaluation, we argue that the time has come to critically
              re-examine TDA and to acknowledge not only its merits but also
              its demerits. We demonstrate that some popular MS/MS search tools
              are not TDA-compliant and that it is easy to develop a non-TDA
              compliant tool that outperforms all TDA-compliant tools. Since
              the distinction between TDA-compliant and non-TDA compliant tools
              remains elusive, we are concerned about a possible proliferation
              of non-TDA-compliant tools in the future (developed with the best
              intentions). We are also concerned that estimation of the FDR by
              TDA awkwardly depends on a virtual coin toss and argue that it is
              important to take the coin toss factor out of our estimation of
              the FDR. Since computing FDR via TDA suffers from various
              restrictions, we argue that TDA is not needed when accurate
              p-values of individual Peptide-Spectrum Matches are available.},
}
% MSnbase: R/Bioconductor package for isobaric-tagging quantitative proteomics.
@article{Gatto2012,
  author   = {Gatto, Laurent and Lilley, Kathryn S},
  title    = {{MSnbase}---an {R/Bioconductor} package for isobaric tagged mass
              spectrometry data visualization, processing and quantitation},
  journal  = {Bioinformatics},
  volume   = {28},
  number   = {2},
  pages    = {288--289},
  month    = jan,
  year     = {2012},
  language = {en},
  abstract = {UNLABELLED: MSnbase is an R/Bioconductor package for the analysis
              of quantitative proteomics experiments that use isobaric tagging.
              It provides an exploratory data analysis framework for
              reproducible research, allowing raw data import, quality control,
              visualization, data processing and quantitation. MSnbase allows
              direct integration of quantitative proteomics data with
              additional facilities for statistical analysis provided by the
              Bioconductor project. AVAILABILITY: MSnbase is implemented in R
              (version $\geq$ 2.13.0) and available at the Bioconductor web
              site (http://www.bioconductor.org/). Vignettes outlining typical
              workflows, input/output capabilities and detailing underlying
              infrastructure are included in the package.},
}
% shiny R package, version 1.7.1.
@manual{shiny2021,
  author = {Chang, Winston and Cheng, Joe and Allaire, JJ and Sievert, Carson and Schloerke, Barret and Xie, Yihui and Allen, Jeff and McPherson, Jonathan and Dipert, Alan and Borges, Barbara},
  title  = {{shiny}: Web Application Framework for {R}},
  year   = {2021},
  note   = {R package version 1.7.1},
  url    = {https://CRAN.R-project.org/package=shiny},
}
% Nokoi: decoy-free binary classifier for peptide-to-spectrum matches.
@article{Gonnelli2015,
  author   = {Gonnelli, Giulia and Stock, Michiel and Verwaeren, Jan
              and Maddelein, Davy and De Baets, Bernard
              and Martens, Lennart and Degroeve, Sven},
  title    = {A decoy-free approach to the identification of peptides},
  journal  = {Journal of Proteome Research},
  volume   = {14},
  number   = {4},
  pages    = {1792--1798},
  month    = apr,
  year     = {2015},
  keywords = {decoy databases; machine learning; peptide identification},
  language = {en},
  abstract = {A growing number of proteogenomics and metaproteomics studies
              indicate potential limitations of the application of the
              ``decoy'' database paradigm used to separate correct peptide
              identifications from incorrect ones in traditional shotgun
              proteomics. We therefore propose a binary classifier called Nokoi
              that allows fast yet reliable decoy-free separation of correct
              from incorrect peptide-to-spectrum matches (PSMs). Nokoi was
              trained on a very large collection of heterogeneous data using
              ranks supplied by the Mascot search engine to label correct and
              incorrect PSMs. We show that Nokoi outperforms Mascot and
              achieves a performance very close to that of Percolator at
              substantially higher processing speeds.},
}
% Two-groups model and empirical Bayes approach to false discovery rates.
@article{Efron2008,
  author    = {Efron, Bradley},
  title     = {Microarrays, Empirical {Bayes} and the Two-Groups Model},
  journal   = {Statistical Science},
  volume    = {23},
  number    = {1},
  pages     = {1--22},
  year      = {2008},
  publisher = {Institute of Mathematical Statistics},
  keywords  = {empirical null, False discovery rates, simultaneous tests},
  doi       = {10.1214/07-STS236},
  url       = {https://doi.org/10.1214/07-STS236},
}
% Improvements to Percolator, including the Q-ranker method.
@article{Spivak2009,
  author   = {Spivak, Marina and Weston, Jason and Bottou, L{\'e}on
              and K{\"a}ll, Lukas and Noble, William Stafford},
  title    = {Improvements to the {Percolator} algorithm for peptide
              identification from shotgun proteomics data sets},
  journal  = {Journal of Proteome Research},
  volume   = {8},
  number   = {7},
  pages    = {3737--3745},
  month    = jul,
  year     = {2009},
  language = {en},
  abstract = {Shotgun proteomics coupled with database search software allows
              the identification of a large number of peptides in a single
              experiment. However, some existing search algorithms, such as
              SEQUEST, use score functions that are designed primarily to
              identify the best peptide for a given spectrum. Consequently,
              when comparing identifications across spectra, the SEQUEST score
              function Xcorr fails to discriminate accurately between correct
              and incorrect peptide identifications. Several machine learning
              methods have been proposed to address the resulting
              classification task of distinguishing between correct and
              incorrect peptide-spectrum matches (PSMs). A recent example is
              Percolator, which uses semisupervised learning and a decoy
              database search strategy to learn to distinguish between correct
              and incorrect PSMs identified by a database search algorithm. The
              current work describes three improvements to Percolator. (1)
              Percolator's heuristic optimization is replaced with a clear
              objective function, with intuitive reasons behind its choice. (2)
              Tractable nonlinear models are used instead of linear models,
              leading to improved accuracy over the original Percolator. (3) A
              method, Q-ranker, for directly optimizing the number of
              identified spectra at a specified q value is proposed, which
              achieves further gains.},
}