Merge pull request #186 from compomics/feature/direct-flashlfq-output

Switch to FlashLFQ output from psm_utils
compomics · Sep 19, 2024 · commit 3950020
2 parents b1e23f3 + b5f3d78

Showing 9 changed files with 46 additions and 58 deletions.
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
@@ -67,6 +67,7 @@
     - **One of**
       - *string*
       - *null*
+  - **`write_flashlfq`** *(boolean)*: Write results to a FlashLFQ-compatible file. Default: `false`.
   - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`.
   - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`.
 ## Definitions
@@ -93,7 +94,6 @@
   - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`.
   - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
   - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
-  - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`.
 - <a id="definitions/percolator"></a>**`percolator`** *(object)*: Percolator rescoring engine configuration. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
   - **`init-weights`**: Weights file for scoring function. Default: `false`.
     - **One of**

diff --git a/examples/msgfplus-ms2rescore.json b/examples/msgfplus-ms2rescore.json
@@ -7,9 +7,6 @@
         },
         "log_level": "debug",
         "processes": 16,
-        "feature_generators": {
-            "basic": {}
-        },
         "rescoring_engine": {
             "mokapot": {
                 "fasta_file": "examples/proteins/uniprot-proteome-human-contaminants.fasta",

diff --git a/examples/msgfplus-ms2rescore.toml b/examples/msgfplus-ms2rescore.toml
@@ -5,25 +5,7 @@ psm_reader_kwargs = { "score_column" = "PSMScore" }
 log_level = "debug"
 processes = 16
 
-# [ms2rescore.modification_mapping]
-
-# [ms2rescore.fixed_modifications]
-
-[ms2rescore.feature_generators.basic]
-# No options, but setting heading enables feature generator
-
-# [ms2rescore.feature_generators.ms2pip]
-# model = "HCD"
-# ms2_tolerance = 0.02
-
-# [ms2rescore.feature_generators.deeplc]
-# deeplc_retrain = false
-
-# [ms2rescore.feature_generators.maxquant]
-# No options, but setting heading enables feature generator
-
 [ms2rescore.rescoring_engine.mokapot]
 fasta_file = "examples/proteins/uniprot-proteome-human-contaminants.fasta"
 write_weights = true
 write_txt = true
-# write_flashlfq = true
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -163,6 +163,16 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     logger.info(f"Writing output to {output_file_root}.psms.tsv...")
     psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
 
+    if config["write_flashlfq"]:
+        logger.info(f"Writing output to {output_file_root}.flashlfq.tsv...")
+        psm_utils.io.write_file(
+            psm_list,
+            output_file_root + ".flashlfq.tsv",
+            filetype="flashlfq",
+            fdr_threshold=0.01,  # TODO: Make FDR threshold configurable
+            only_target=True,
+        )
+
     # Write report
     if config["write_report"]:
         try:

diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
@@ -370,6 +370,17 @@ def __init__(self, *args, **kwargs):
         )
         self.usi.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
 
+        self.write_flashlfq = widgets.LabeledSwitch(
+            self,
+            label="Write FlashLFQ input file",
+            description=(
+                "Write a file that can be used as input for FlashLFQ. This file only contains "
+                "target PSMs that pass the FDR threshold."
+            ),
+            wraplength=CONFIG_WIDTH - 180,
+        )
+        self.write_flashlfq.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
+
         self.generate_report = widgets.LabeledSwitch(
             self,
             label="Generate interactive report",
@@ -380,7 +391,7 @@ def __init__(self, *args, **kwargs):
             wraplength=CONFIG_WIDTH - 180,
             default=True,
         )
-        self.generate_report.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
+        self.generate_report.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
 
         self.id_decoy_pattern = widgets.LabeledEntry(
             self,
@@ -392,7 +403,7 @@ def __init__(self, *args, **kwargs):
             ),
             wraplength=CONFIG_WIDTH - 180,
         )
-        self.id_decoy_pattern.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+        self.id_decoy_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew")
 
         self.psm_id_pattern = widgets.LabeledEntry(
             self,
@@ -404,7 +415,7 @@ def __init__(self, *args, **kwargs):
             ),
             wraplength=CONFIG_WIDTH - 180,
         )
-        self.psm_id_pattern.grid(row=4, column=0, pady=(0, 10), sticky="nsew")
+        self.psm_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
 
         self.spectrum_id_pattern = widgets.LabeledEntry(
             self,
@@ -414,7 +425,7 @@ def __init__(self, *args, **kwargs):
             ),
             wraplength=CONFIG_WIDTH - 180,
         )
-        self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
+        self.spectrum_id_pattern.grid(row=6, column=0, pady=(0, 10), sticky="nsew")
 
         self.processes = widgets.LabeledOptionMenu(
             self,
@@ -428,7 +439,7 @@ def __init__(self, *args, **kwargs):
             values=[str(x) for x in list(range(1, min(16, multiprocessing.cpu_count()) + 1))],
             default_value=str(min(16, multiprocessing.cpu_count())),
         )
-        self.processes.grid(row=6, column=0, pady=(0, 10), sticky="nsew")
+        self.processes.grid(row=7, column=0, pady=(0, 10), sticky="nsew")
 
         self.file_prefix = widgets.LabeledFileSelect(
             self,
@@ -441,7 +452,7 @@ def __init__(self, *args, **kwargs):
             ),
             wraplength=CONFIG_WIDTH - 20,
         )
-        self.file_prefix.grid(row=7, column=0, columnspan=2, sticky="nsew")
+        self.file_prefix.grid(row=8, column=0, columnspan=2, sticky="nsew")
 
         self.config_file = widgets.LabeledFileSelect(
             self,
@@ -453,13 +464,14 @@ def __init__(self, *args, **kwargs):
             ),
             wraplength=CONFIG_WIDTH - 20,
         )
-        self.config_file.grid(row=8, column=0, columnspan=2, sticky="nsew")
+        self.config_file.grid(row=9, column=0, columnspan=2, sticky="nsew")
 
     def get(self) -> Dict:
         """Get the configured values as a dictionary."""
         return {
             "lower_score_is_better": bool(int(self.lower_score.get())),  # str repr of 0 or 1
             "rename_to_usi": self.usi.get(),
+            "write_flashlfq": self.write_flashlfq.get(),
             "write_report": self.generate_report.get(),
             "id_decoy_pattern": self.id_decoy_pattern.get(),
             "psm_id_pattern": self.psm_id_pattern.get(),
@@ -732,12 +744,6 @@ def __init__(self, *args, **kwargs):
         self.write_txt.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew")
         row_n += 1
 
-        self.write_flashlfq = widgets.LabeledSwitch(
-            self, label="Write file for FlashLFQ", default=False
-        )
-        self.write_flashlfq.grid(row=row_n, column=0, pady=(0, 10), sticky="nsew")
-        row_n += 1
-
         self.fasta_file = widgets.LabeledFileSelect(
             self,
             label="Select FASTA file (optional, required for protein inference)",
@@ -760,7 +766,6 @@ def get(self) -> Dict:
         config = {
             "write_weights": self.write_weights.get(),
             "write_txt": self.write_txt.get(),
-            "write_flashlfq": self.write_flashlfq.get(),
             "fasta_file": self.fasta_file.get(),
             "protein_kwargs": self._parse_protein_kwargs(self.protein_kwargs.get()),
         }

diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json
@@ -16,8 +16,7 @@
             "mokapot": {
                 "train_fdr": 0.01,
                 "write_weights": true,
-                "write_txt": true,
-                "write_flashlfq": true
+                "write_txt": true
             }
         },
         "config_file": null,
@@ -40,6 +39,7 @@
         "processes": -1,
         "rename_to_usi": false,
         "fasta_file": null,
+        "write_flashlfq": false,
         "write_report": false
     }
 }
diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json
@@ -16,8 +16,7 @@
         "rescoring_engine": {
             "mokapot": {
                 "write_weights": true,
-                "write_txt": true,
-                "write_flashlfq": true
+                "write_txt": true
             }
         },
         "psm_file": null

diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
@@ -169,6 +169,11 @@
                     "description": "Path to FASTA file with protein sequences to use for protein inference",
                     "oneOf": [{ "type": "string" }, { "type": "null" }]
                 },
+                "write_flashlfq": {
+                    "description": "Write results to a FlashLFQ-compatible file",
+                    "type": "boolean",
+                    "default": false
+                },
                 "write_report": {
                     "description": "Write an HTML report with various QC metrics and charts",
                     "type": "boolean",
@@ -295,11 +300,6 @@
                     "description": "Write Mokapot results to a text file",
                     "type": "boolean",
                     "default": false
-                },
-                "write_flashlfq": {
-                    "description": "Write Mokapot results to a FlashLFQ-compatible file",
-                    "type": "boolean",
-                    "default": false
                 }
             }
         },

diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
@@ -45,7 +45,6 @@ def rescore(
     train_fdr: float = 0.01,
     write_weights: bool = False,
     write_txt: bool = False,
-    write_flashlfq: bool = False,
     protein_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> None:
@@ -57,8 +56,7 @@ def rescore(
     :py:class:`~mokapot.dataset.LinearPsmDataset`, and then optionally adds protein information
     from a FASTA file. The dataset is then passed to the :py:func:`~mokapot.brew` function, which
     returns the new scores, q-values, and PEPs. These are then written back to the original
-    :py:class:`~psm_utils.psm_list.PSMList`. Optionally, results can be written to a Mokapot text
-    file, a FlashLFQ-compatible file, or the model weights can be saved.
+    :py:class:`~psm_utils.psm_list.PSMList`.
 
     Parameters
     ----------
@@ -75,8 +73,6 @@ def rescore(
         Write model weights to a text file. Defaults to ``False``.
     write_txt
         Write Mokapot results to a text file. Defaults to ``False``.
-    write_flashlfq
-        Write Mokapot results to a FlashLFQ-compatible file. Defaults to ``False``.
     protein_kwargs
         Keyword arguments to pass to the :py:meth:`~mokapot.dataset.LinearPsmDataset.add_proteins`
         method.
@@ -86,6 +82,13 @@ def rescore(
     """
     _set_log_levels()
 
+    if "write_flashlfq" in kwargs:
+        _ = kwargs.pop("write_flashlfq")
+        logger.warning(
+            "The `write_flashlfq` argument has moved. To write FlashLFQ generic TSV, use the "
+            "MS²Rescore-level `write_flashlfq` option instead."
+        )
+
     # Convert PSMList to Mokapot dataset
     lin_psm_data = convert_psm_list(psm_list)
     feature_names = list(lin_psm_data.features.columns)
@@ -119,10 +122,6 @@ def rescore(
             )
     if write_txt:
         confidence_results.to_txt(file_root=output_file_root, decoys=True)
-    if write_flashlfq:
-        # TODO: How do we validate that the RTs are in minutes?
-        confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60
-        confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt")
 
 
 def convert_psm_list(
@@ -167,10 +166,6 @@ def convert_psm_list(
     feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
     combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)
 
-    # Ensure filename for FlashLFQ txt output
-    if not combined_df["run"].notnull().all():
-        combined_df["run"] = "na"
-
     feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None
 
     lin_psm_data = LinearPsmDataset(