From ba6208687fa39bfe2495b6e613b3b3ef354bd8ec Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 5 Sep 2024 11:06:19 +0200 Subject: [PATCH 1/4] Switch to psm_utils flashlfq writing instead of Mokapot function. This removes the ms2rescore>mokapot>write_flashlfq option and adds the ms2rescore>write_flashlfq option. --- docs/source/config_schema.md | 2 +- ms2rescore/core.py | 10 ++++++++++ ms2rescore/package_data/config_default.json | 4 ++-- ms2rescore/package_data/config_schema.json | 10 +++++----- ms2rescore/rescoring_engines/mokapot.py | 17 ++++++++--------- pyproject.toml | 2 +- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index 209bc00..5d8422b 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -67,6 +67,7 @@ - **One of** - *string* - *null* + - **`write_flashlfq`** *(boolean)*: Write results to a FlashLFQ-compatible file. Default: `false`. - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`. - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`. ## Definitions @@ -93,7 +94,6 @@ - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`. - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`. - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`. - - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`. - **`percolator`** *(object)*: Percolator rescoring engine configuration. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. - **`init-weights`**: Weights file for scoring function. Default: `false`. - **One of** diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 170f103..a02ab4f 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -163,6 +163,16 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.info(f"Writing output to {output_file_root}.psms.tsv...") psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + if config["write_flashlfq"]: + logger.info(f"Writing output to {output_file_root}.flashlfq.tsv...") + psm_utils.io.write_file( + psm_list, + output_file_root + ".flashlfq.tsv", + filetype="flashlfq", + fdr_threshold=0.01, + only_target=True, # TODO: Make FDR threshold configurable + ) + # Write report if config["write_report"]: try: diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 733ea70..805be2c 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -16,8 +16,7 @@ "mokapot": { "train_fdr": 0.01, "write_weights": true, - "write_txt": true, - "write_flashlfq": true + "write_txt": true } }, "config_file": null, @@ -40,6 +39,7 @@ "processes": -1, "rename_to_usi": false, "fasta_file": null, + "write_flashlfq": false, "write_report": false } } diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index ab77f3b..e9b0379 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -169,6 +169,11 @@ "description": "Path to FASTA file with protein sequences to use for protein inference", "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "write_flashlfq": { + "description": "Write results to a FlashLFQ-compatible file", + "type": "boolean", + "default": false + }, "write_report": { "description": "Write an HTML report with various QC metrics and charts", "type": "boolean", @@ -295,11 +300,6 @@ "description": "Write Mokapot results to a text file", "type": "boolean", "default": false - }, - "write_flashlfq": { - "description": "Write Mokapot results to a FlashLFQ-compatible file", - "type": "boolean", - "default": false } } }, diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index f02d877..f2d7218 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -45,7 +45,6 @@ def rescore( train_fdr: float = 0.01, write_weights: bool = False, write_txt: bool = False, - write_flashlfq: bool = False, protein_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> None: @@ -57,8 +56,7 @@ def rescore( :py:class:`~mokapot.dataset.LinearPsmDataset`, and then optionally adds protein information from a FASTA file. The dataset is then passed to the :py:func:`~mokapot.brew` function, which returns the new scores, q-values, and PEPs. These are then written back to the original - :py:class:`~psm_utils.psm_list.PSMList`. Optionally, results can be written to a Mokapot text - file, a FlashLFQ-compatible file, or the model weights can be saved. + :py:class:`~psm_utils.psm_list.PSMList`. Parameters ---------- @@ -75,8 +73,6 @@ def rescore( Write model weights to a text file. Defaults to ``False``. write_txt Write Mokapot results to a text file. Defaults to ``False``. - write_flashlfq - Write Mokapot results to a FlashLFQ-compatible file. Defaults to ``False``. protein_kwargs Keyword arguments to pass to the :py:meth:`~mokapot.dataset.LinearPsmDataset.add_proteins` method. @@ -86,6 +82,13 @@ def rescore( """ _set_log_levels() + if "write_flashlfq" in kwargs: + _ = kwargs.pop("write_flashlfq") + logger.warning( + "The Mokapot `write_flashlfq` argument has been deprecated. To write FlashLFQ generic " + "TSV, use the MS²Rescore-level `write_flashlfq` option instead." + ) + # Convert PSMList to Mokapot dataset lin_psm_data = convert_psm_list(psm_list) feature_names = list(lin_psm_data.features.columns) @@ -119,10 +122,6 @@ def rescore( ) if write_txt: confidence_results.to_txt(file_root=output_file_root, decoys=True) - if write_flashlfq: - # TODO: How do we validate that the RTs are in minutes? - confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60 - confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt") def convert_psm_list( diff --git a/pyproject.toml b/pyproject.toml index f829cce..da265fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "numpy>=1.16.0", "pandas>=1.0", "plotly>=5", - "psm_utils>=0.9", + "psm_utils>=1.1", "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", From 556b6d43ab1e6ac455f46dc7fb777fa481548e11 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 5 Sep 2024 11:25:04 +0200 Subject: [PATCH 2/4] GUI: Move write_flashlfq option --- ms2rescore/gui/app.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py index cd13fa2..29eefa2 100644 --- a/ms2rescore/gui/app.py +++ b/ms2rescore/gui/app.py @@ -370,6 +370,17 @@ def __init__(self, *args, **kwargs): ) self.usi.grid(row=1, column=0, pady=(0, 10), sticky="nsew") + self.write_flashlfq = widgets.LabeledSwitch( + self, + label="Write FlashLFQ input file", + description=( + "Write a file that can be used as input for FlashLFQ. This file only contains " + "target PSMs that pass the FDR threshold." + ), + wraplength=CONFIG_WIDTH - 180, + ) + self.write_flashlfq.grid(row=2, column=0, pady=(0, 10), sticky="nsew") + self.generate_report = widgets.LabeledSwitch( self, label="Generate interactive report", @@ -380,7 +391,7 @@ def __init__(self, *args, **kwargs): wraplength=CONFIG_WIDTH - 180, default=True, ) - self.generate_report.grid(row=2, column=0, pady=(0, 10), sticky="nsew") + self.generate_report.grid(row=3, column=0, pady=(0, 10), sticky="nsew") self.id_decoy_pattern = widgets.LabeledEntry( self, @@ -392,7 +403,7 @@ def __init__(self, *args, **kwargs): ), wraplength=CONFIG_WIDTH - 180, ) - self.id_decoy_pattern.grid(row=3, column=0, pady=(0, 10), sticky="nsew") + self.id_decoy_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew") self.psm_id_pattern = widgets.LabeledEntry( self, @@ -404,7 +415,7 @@ def __init__(self, *args, **kwargs): ), wraplength=CONFIG_WIDTH - 180, ) - self.psm_id_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew") + self.psm_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew") self.spectrum_id_pattern = widgets.LabeledEntry( self, @@ -414,7 +425,7 @@ def __init__(self, *args, **kwargs): ), wraplength=CONFIG_WIDTH - 180, ) - self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew") + self.spectrum_id_pattern.grid(row=6, column=0, pady=(0, 10), sticky="nsew") self.processes = widgets.LabeledOptionMenu( self, @@ -428,7 +439,7 @@ def __init__(self, *args, **kwargs): values=[str(x) for x in list(range(1, min(16, multiprocessing.cpu_count()) + 1))], default_value=str(min(16, multiprocessing.cpu_count())), ) - self.processes.grid(row=6, column=0, pady=(0, 10), sticky="nsew") + self.processes.grid(row=7, column=0, pady=(0, 10), sticky="nsew") self.file_prefix = widgets.LabeledFileSelect( self, @@ -441,7 +452,7 @@ def __init__(self, *args, **kwargs): ), wraplength=CONFIG_WIDTH - 20, ) - self.file_prefix.grid(row=7, column=0, columnspan=2, sticky="nsew") + self.file_prefix.grid(row=8, column=0, columnspan=2, sticky="nsew") self.config_file = widgets.LabeledFileSelect( self, @@ -453,13 +464,14 @@ def __init__(self, *args, **kwargs): ), wraplength=CONFIG_WIDTH - 20, ) - self.config_file.grid(row=8, column=0, columnspan=2, sticky="nsew") + self.config_file.grid(row=9, column=0, columnspan=2, sticky="nsew") def get(self) -> Dict: """Get the configured values as a dictionary.""" return { "lower_score_is_better": bool(int(self.lower_score.get())), # str repr of 0 or 1 "rename_to_usi": self.usi.get(), + "write_flashlfq": self.write_flashlfq.get(), "write_report": self.generate_report.get(), "id_decoy_pattern": self.id_decoy_pattern.get(), "psm_id_pattern": self.psm_id_pattern.get(), @@ -732,12 +744,6 @@ def __init__(self, *args, **kwargs): self.write_txt.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew") row_n += 1 - self.write_flashlfq = widgets.LabeledSwitch( - self, label="Write file for FlashLFQ", default=False - ) - self.write_flashlfq.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew") - row_n += 1 - self.fasta_file = widgets.LabeledFileSelect( self, label="Select FASTA file (optional, required for protein inference)", @@ -760,7 +766,6 @@ def get(self) -> Dict: config = { "write_weights": self.write_weights.get(), "write_txt": self.write_txt.get(), - "write_flashlfq": self.write_flashlfq.get(), "fasta_file": self.fasta_file.get(), "protein_kwargs": self._parse_protein_kwargs(self.protein_kwargs.get()), } From 205a9ee894ae4ada8168946856df4d1df855f19f Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 5 Sep 2024 11:25:41 +0200 Subject: [PATCH 3/4] Update default tims config --- ms2rescore/package_data/config_default_tims.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json index 2a77adf..89913cc 100644 --- a/ms2rescore/package_data/config_default_tims.json +++ b/ms2rescore/package_data/config_default_tims.json @@ -16,8 +16,7 @@ "rescoring_engine": { "mokapot": { "write_weights": true, - "write_txt": true, - "write_flashlfq": true + "write_txt": true } }, "psm_file": null From 2fc955296f8d4b262f3f8ef6609ae7f50a699f9a Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 5 Sep 2024 11:31:52 +0200 Subject: [PATCH 4/4] Final tweaks for flashlfq move: update msgfplus example; remove run "na" setting for mokapot flashlfq --- examples/msgfplus-ms2rescore.json | 3 --- examples/msgfplus-ms2rescore.toml | 18 ------------------ ms2rescore/rescoring_engines/mokapot.py | 8 ++------ 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/examples/msgfplus-ms2rescore.json b/examples/msgfplus-ms2rescore.json index ee6ec2a..8088b72 100644 --- a/examples/msgfplus-ms2rescore.json +++ b/examples/msgfplus-ms2rescore.json @@ -7,9 +7,6 @@ }, "log_level": "debug", "processes": 16, - "feature_generators": { - "basic": {} - }, "rescoring_engine": { "mokapot": { "fasta_file": "examples/proteins/uniprot-proteome-human-contaminants.fasta", diff --git a/examples/msgfplus-ms2rescore.toml b/examples/msgfplus-ms2rescore.toml index 805a361..533d973 100644 --- a/examples/msgfplus-ms2rescore.toml +++ b/examples/msgfplus-ms2rescore.toml @@ -5,25 +5,7 @@ psm_reader_kwargs = { "score_column" = "PSMScore" } log_level = "debug" processes = 16 -# [ms2rescore.modification_mapping] - -# [ms2rescore.fixed_modifications] - -[ms2rescore.feature_generators.basic] -# No options, but setting heading enables feature generator - -# [ms2rescore.feature_generators.ms2pip] -# model = "HCD" -# ms2_tolerance = 0.02 - -# [ms2rescore.feature_generators.deeplc] -# deeplc_retrain = false - -# [ms2rescore.feature_generators.maxquant] -# No options, but setting heading enables feature generator - [ms2rescore.rescoring_engine.mokapot] fasta_file = "examples/proteins/uniprot-proteome-human-contaminants.fasta" write_weights = true write_txt = true -# write_flashlfq = true diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index f2d7218..967c40b 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -85,8 +85,8 @@ def rescore( if "write_flashlfq" in kwargs: _ = kwargs.pop("write_flashlfq") logger.warning( - "The Mokapot `write_flashlfq` argument has been deprecated. To write FlashLFQ generic " - "TSV, use the MS²Rescore-level `write_flashlfq` option instead." + "The `write_flashlfq` argument has moved. To write FlashLFQ generic TSV, use the " + "MS²Rescore-level `write_flashlfq` option instead." ) # Convert PSMList to Mokapot dataset @@ -166,10 +166,6 @@ def convert_psm_list( feature_df.columns = [f"feature:{f}" for f in feature_df.columns] combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1) - # Ensure filename for FlashLFQ txt output - if not combined_df["run"].notnull().all(): - combined_df["run"] = "na" - feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None lin_psm_data = LinearPsmDataset(