Skip to content

Commit

Permalink
Merge pull request #186 from compomics/feature/direct-flashlfq-output
Browse files Browse the repository at this point in the history
Switch to FlashLFQ output from psm_utils
  • Loading branch information
RalfG authored Sep 19, 2024
2 parents b1e23f3 + b5f3d78 commit 3950020
Show file tree
Hide file tree
Showing 9 changed files with 46 additions and 58 deletions.
2 changes: 1 addition & 1 deletion docs/source/config_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
- **One of**
- *string*
- *null*
- **`write_flashlfq`** *(boolean)*: Write results to a FlashLFQ-compatible file. This file only contains target PSMs that pass the FDR threshold. Default: `false`.
- **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`.
- **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`.
## Definitions
Expand All @@ -93,7 +94,6 @@
- **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
- **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`.
- <a id="definitions/percolator"></a>**`percolator`** *(object)*: Percolator rescoring engine configuration. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`init-weights`**: Weights file for scoring function. Default: `false`.
- **One of**
Expand Down
3 changes: 0 additions & 3 deletions examples/msgfplus-ms2rescore.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
},
"log_level": "debug",
"processes": 16,
"feature_generators": {
"basic": {}
},
"rescoring_engine": {
"mokapot": {
"fasta_file": "examples/proteins/uniprot-proteome-human-contaminants.fasta",
Expand Down
18 changes: 0 additions & 18 deletions examples/msgfplus-ms2rescore.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,7 @@ psm_reader_kwargs = { "score_column" = "PSMScore" }
log_level = "debug"
processes = 16

# [ms2rescore.modification_mapping]

# [ms2rescore.fixed_modifications]

[ms2rescore.feature_generators.basic]
# No options, but setting heading enables feature generator

# [ms2rescore.feature_generators.ms2pip]
# model = "HCD"
# ms2_tolerance = 0.02

# [ms2rescore.feature_generators.deeplc]
# deeplc_retrain = false

# [ms2rescore.feature_generators.maxquant]
# No options, but setting heading enables feature generator

[ms2rescore.rescoring_engine.mokapot]
fasta_file = "examples/proteins/uniprot-proteome-human-contaminants.fasta"
write_weights = true
write_txt = true
# write_flashlfq = true
10 changes: 10 additions & 0 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,16 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.info(f"Writing output to {output_file_root}.psms.tsv...")
psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")

if config["write_flashlfq"]:
logger.info(f"Writing output to {output_file_root}.flashlfq.tsv...")
psm_utils.io.write_file(
psm_list,
output_file_root + ".flashlfq.tsv",
filetype="flashlfq",
fdr_threshold=0.01, # TODO: Make FDR threshold configurable
only_target=True,
)

# Write report
if config["write_report"]:
try:
Expand Down
33 changes: 19 additions & 14 deletions ms2rescore/gui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,17 @@ def __init__(self, *args, **kwargs):
)
self.usi.grid(row=1, column=0, pady=(0, 10), sticky="nsew")

self.write_flashlfq = widgets.LabeledSwitch(
self,
label="Write FlashLFQ input file",
description=(
"Write a file that can be used as input for FlashLFQ. This file only contains "
"target PSMs that pass the FDR threshold."
),
wraplength=CONFIG_WIDTH - 180,
)
self.write_flashlfq.grid(row=2, column=0, pady=(0, 10), sticky="nsew")

self.generate_report = widgets.LabeledSwitch(
self,
label="Generate interactive report",
Expand All @@ -380,7 +391,7 @@ def __init__(self, *args, **kwargs):
wraplength=CONFIG_WIDTH - 180,
default=True,
)
self.generate_report.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
self.generate_report.grid(row=3, column=0, pady=(0, 10), sticky="nsew")

self.id_decoy_pattern = widgets.LabeledEntry(
self,
Expand All @@ -392,7 +403,7 @@ def __init__(self, *args, **kwargs):
),
wraplength=CONFIG_WIDTH - 180,
)
self.id_decoy_pattern.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
self.id_decoy_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew")

self.psm_id_pattern = widgets.LabeledEntry(
self,
Expand All @@ -404,7 +415,7 @@ def __init__(self, *args, **kwargs):
),
wraplength=CONFIG_WIDTH - 180,
)
self.psm_id_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew")
self.psm_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")

self.spectrum_id_pattern = widgets.LabeledEntry(
self,
Expand All @@ -414,7 +425,7 @@ def __init__(self, *args, **kwargs):
),
wraplength=CONFIG_WIDTH - 180,
)
self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
self.spectrum_id_pattern.grid(row=6, column=0, pady=(0, 10), sticky="nsew")

self.processes = widgets.LabeledOptionMenu(
self,
Expand All @@ -428,7 +439,7 @@ def __init__(self, *args, **kwargs):
values=[str(x) for x in list(range(1, min(16, multiprocessing.cpu_count()) + 1))],
default_value=str(min(16, multiprocessing.cpu_count())),
)
self.processes.grid(row=6, column=0, pady=(0, 10), sticky="nsew")
self.processes.grid(row=7, column=0, pady=(0, 10), sticky="nsew")

self.file_prefix = widgets.LabeledFileSelect(
self,
Expand All @@ -441,7 +452,7 @@ def __init__(self, *args, **kwargs):
),
wraplength=CONFIG_WIDTH - 20,
)
self.file_prefix.grid(row=7, column=0, columnspan=2, sticky="nsew")
self.file_prefix.grid(row=8, column=0, columnspan=2, sticky="nsew")

self.config_file = widgets.LabeledFileSelect(
self,
Expand All @@ -453,13 +464,14 @@ def __init__(self, *args, **kwargs):
),
wraplength=CONFIG_WIDTH - 20,
)
self.config_file.grid(row=8, column=0, columnspan=2, sticky="nsew")
self.config_file.grid(row=9, column=0, columnspan=2, sticky="nsew")

def get(self) -> Dict:
"""Get the configured values as a dictionary."""
return {
"lower_score_is_better": bool(int(self.lower_score.get())), # str repr of 0 or 1
"rename_to_usi": self.usi.get(),
"write_flashlfq": self.write_flashlfq.get(),
"write_report": self.generate_report.get(),
"id_decoy_pattern": self.id_decoy_pattern.get(),
"psm_id_pattern": self.psm_id_pattern.get(),
Expand Down Expand Up @@ -732,12 +744,6 @@ def __init__(self, *args, **kwargs):
self.write_txt.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew")
row_n += 1

self.write_flashlfq = widgets.LabeledSwitch(
self, label="Write file for FlashLFQ", default=False
)
self.write_flashlfq.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew")
row_n += 1

self.fasta_file = widgets.LabeledFileSelect(
self,
label="Select FASTA file (optional, required for protein inference)",
Expand All @@ -760,7 +766,6 @@ def get(self) -> Dict:
config = {
"write_weights": self.write_weights.get(),
"write_txt": self.write_txt.get(),
"write_flashlfq": self.write_flashlfq.get(),
"fasta_file": self.fasta_file.get(),
"protein_kwargs": self._parse_protein_kwargs(self.protein_kwargs.get()),
}
Expand Down
4 changes: 2 additions & 2 deletions ms2rescore/package_data/config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
"mokapot": {
"train_fdr": 0.01,
"write_weights": true,
"write_txt": true,
"write_flashlfq": true
"write_txt": true
}
},
"config_file": null,
Expand All @@ -40,6 +39,7 @@
"processes": -1,
"rename_to_usi": false,
"fasta_file": null,
"write_flashlfq": false,
"write_report": false
}
}
3 changes: 1 addition & 2 deletions ms2rescore/package_data/config_default_tims.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
"rescoring_engine": {
"mokapot": {
"write_weights": true,
"write_txt": true,
"write_flashlfq": true
"write_txt": true
}
},
"psm_file": null
Expand Down
10 changes: 5 additions & 5 deletions ms2rescore/package_data/config_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
"description": "Path to FASTA file with protein sequences to use for protein inference",
"oneOf": [{ "type": "string" }, { "type": "null" }]
},
"write_flashlfq": {
"description": "Write results to a FlashLFQ-compatible file",
"type": "boolean",
"default": false
},
"write_report": {
"description": "Write an HTML report with various QC metrics and charts",
"type": "boolean",
Expand Down Expand Up @@ -295,11 +300,6 @@
"description": "Write Mokapot results to a text file",
"type": "boolean",
"default": false
},
"write_flashlfq": {
"description": "Write Mokapot results to a FlashLFQ-compatible file",
"type": "boolean",
"default": false
}
}
},
Expand Down
21 changes: 8 additions & 13 deletions ms2rescore/rescoring_engines/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def rescore(
train_fdr: float = 0.01,
write_weights: bool = False,
write_txt: bool = False,
write_flashlfq: bool = False,
protein_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> None:
Expand All @@ -57,8 +56,7 @@ def rescore(
:py:class:`~mokapot.dataset.LinearPsmDataset`, and then optionally adds protein information
from a FASTA file. The dataset is then passed to the :py:func:`~mokapot.brew` function, which
returns the new scores, q-values, and PEPs. These are then written back to the original
:py:class:`~psm_utils.psm_list.PSMList`. Optionally, results can be written to a Mokapot text
file, a FlashLFQ-compatible file, or the model weights can be saved.
:py:class:`~psm_utils.psm_list.PSMList`.
Parameters
----------
Expand All @@ -75,8 +73,6 @@ def rescore(
Write model weights to a text file. Defaults to ``False``.
write_txt
Write Mokapot results to a text file. Defaults to ``False``.
write_flashlfq
Write Mokapot results to a FlashLFQ-compatible file. Defaults to ``False``.
protein_kwargs
Keyword arguments to pass to the :py:meth:`~mokapot.dataset.LinearPsmDataset.add_proteins`
method.
Expand All @@ -86,6 +82,13 @@ def rescore(
"""
_set_log_levels()

if "write_flashlfq" in kwargs:
_ = kwargs.pop("write_flashlfq")
logger.warning(
"The `write_flashlfq` argument has moved. To write FlashLFQ generic TSV, use the "
"MS²Rescore-level `write_flashlfq` option instead."
)

# Convert PSMList to Mokapot dataset
lin_psm_data = convert_psm_list(psm_list)
feature_names = list(lin_psm_data.features.columns)
Expand Down Expand Up @@ -119,10 +122,6 @@ def rescore(
)
if write_txt:
confidence_results.to_txt(file_root=output_file_root, decoys=True)
if write_flashlfq:
# TODO: How do we validate that the RTs are in minutes?
confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60
confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt")


def convert_psm_list(
Expand Down Expand Up @@ -167,10 +166,6 @@ def convert_psm_list(
feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)

# Ensure filename for FlashLFQ txt output
if not combined_df["run"].notnull().all():
combined_df["run"] = "na"

feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None

lin_psm_data = LinearPsmDataset(
Expand Down

0 comments on commit 3950020

Please sign in to comment.