diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c03b1667..7172c303 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -29,7 +29,7 @@ jobs: - name: Test built package run: | - pip install dist/ms2rescore-*.whl + pip install --only-binary :all: dist/ms2rescore-*.whl # pytest ms2rescore --help @@ -54,7 +54,7 @@ jobs: - name: Install package and dependencies run: | python -m pip install --upgrade pip - pip install . pyinstaller + pip install --only-binary :all: . pyinstaller - name: Install Inno Setup uses: crazy-max/ghaction-chocolatey@v3 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 339aade3..b2643eec 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -35,11 +35,12 @@ jobs: - name: Build and install ms2rescore package run: | - pip install .[dev] + pip install --only-binary :all: .[dev] + + - name: Test with pytest + run: | + pytest - # - name: Test with pytest - # run: | - # pytest - name: Test installation run: | ms2rescore --help diff --git a/Dockerfile b/Dockerfile index 474f6f5d..847f6a4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,6 @@ -FROM ubuntu:focal +FROM python:3.11 + +# ARG DEBIAN_FRONTEND=noninteractive LABEL name="ms2rescore" @@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in ADD ms2rescore /ms2rescore/ms2rescore RUN apt-get update \ - && apt-get install --no-install-recommends -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \ - && rm -rf /var/lib/apt/lists/* \ - && pip3 install ms2rescore/ + && apt install -y procps \ + && pip install /ms2rescore --only-binary :all: ENTRYPOINT [""] diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index 2b523aa5..f145421c 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -48,6 +48,8 @@ - *string* - *null* - **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`. + - **`max_psm_rank_input`** *(number)*: Maximum rank of PSMs to use as input for rescoring. Minimum: `1`. Default: `10`. + - **`max_psm_rank_output`** *(number)*: Maximum rank of PSMs to return after rescoring, before final FDR calculation. Minimum: `1`. Default: `1`. - **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`. - **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`. - **`processes`** *(number)*: Number of parallel processes to use; -1 for all available. Minimum: `-1`. Default: `-1`. @@ -57,6 +59,7 @@ - *string* - *null* - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`. + - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`. ## Definitions - **`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties. @@ -76,6 +79,7 @@ - **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`. - **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`. - **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. + - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. 
Minimum: `0`. Maximum: `1`. Default: `0.01`. - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`. - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`. - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`. diff --git a/docs/source/userguide/configuration.rst b/docs/source/userguide/configuration.rst index 52036aa7..e22b3ec1 100644 --- a/docs/source/userguide/configuration.rst +++ b/docs/source/userguide/configuration.rst @@ -123,7 +123,7 @@ be configured separately. For instance: .. code-block:: json "fixed_modifications": { - "C": "U:Carbamidomethyl" + "U:Carbamidomethyl": ["C"] } .. tab:: TOML @@ -131,7 +131,7 @@ be configured separately. For instance: .. code-block:: toml [ms2rescore.fixed_modifications] - "Carbamidomethyl" = ["C"] + "U:Carbamidomethyl" = ["C"] .. tab:: GUI @@ -140,6 +140,28 @@ be configured separately. For instance: :alt: fixed modifications configuration in GUI +Fixed terminal modifications can be added by using the special labels ``N-term`` and ``C-term``. +For example, to additionally add TMT6plex to the N-terminus and lysine residues, the following +configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "fixed_modifications": { + "U:Carbamidomethyl": ["C"], + "U:TMT6plex": ["N-term", "K"] + } + +.. tab:: TOML + + .. code-block:: toml + + [ms2rescore.fixed_modifications] + "U:Carbamidomethyl" = ["C"] + "U:TMT6plex" = ["N-term", "K"] + + .. caution:: Most search engines DO return fixed modifications as part of the modified peptide sequences. In these cases, they must NOT be added to the ``fixed_modifications`` configuration. @@ -218,6 +240,65 @@ expression pattern that extracts the decoy status from the protein name: decoy_pattern = "DECOY_" +Multi-rank rescoring +==================== + +Some search engines, such as MaxQuant, report multiple candidate PSMs for the same spectrum. +MS²Rescore can rescore multiple candidate PSMs per spectrum. This allows for lower-ranking +candidate PSMs to become the top-ranked PSM after rescoring. This behavior can be controlled with +the ``max_psm_rank_input`` option. + +To ensure a correct FDR control after rescoring, MS²Rescore filters out lower-ranking PSMs before +final FDR calculation and writing the output files. To allow for lower-ranking PSMs to be included +in the final output - for instance, to consider chimeric spectra - the ``max_psm_rank_output`` +option can be used. + +For example, to rescore the top 5 PSMs per spectrum and output the best PSM after rescoring, +the following configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "max_psm_rank_input": 5 + "max_psm_rank_output": 1 + +.. tab:: TOML + + .. code-block:: toml + + max_psm_rank_input = 5 + max_psm_rank_output = 1 + + +Configuring rescoring engines +============================= + +MS²Rescore supports multiple rescoring engines, such as Mokapot and Percolator. The rescoring +engine can be selected and configured with the ``rescoring_engine`` option. For example, to use +Mokapot with a custom train_fdr of 0.1%, the following configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "rescoring_engine": { + "mokapot": { + "train_fdr": 0.001 + } + +.. tab:: TOML + + .. code-block:: toml + + [ms2rescore.rescoring_engine.mokapot] + train_fdr = 0.001 + + +All options for the rescoring engines can be found in the :ref:`ms2rescore.rescoring_engines` +section. 
+ + All configuration options ========================= diff --git a/docs/source/userguide/input-files.rst b/docs/source/userguide/input-files.rst index 08129b7f..b556b520 100644 --- a/docs/source/userguide/input-files.rst +++ b/docs/source/userguide/input-files.rst @@ -5,16 +5,13 @@ Input files PSM file(s) =========== -The peptide-spectrum match (PSM) file is generally the output from a proteomics search engine. -This file serves as the main input to MS²Rescore. One or multiple PSM files can be provided at -once. Note that merging PSMs from different MS runs could have an impact on the correctness of -the FDR control. +The **peptide-spectrum match (PSM) file** is generally the output from a proteomics search engine. +This file serves as the main input to MS²Rescore. -Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option. -Check the list of :py:mod:`psm_utils` tags in the -:external+psm_utils:ref:`supported file formats ` section. Depending on the -file extension, the file type can also be inferred from the file name. In that case, -``psm_file_type`` option can be set to ``infer``. +The PSM file should contain **all putative identifications** made by the search engine, including +both target and decoy PSMs. Ensure that the search engine was configured to include decoy entries +in the search database and was operated with **target-decoy competition** enabled (i.e., +considering both target and decoy sequences simultaneously during the search). .. attention:: As a general rule, MS²Rescore always needs access to **all target and decoy PSMs, without any @@ -22,6 +19,17 @@ file extension, the file type can also be inferred from the file name. In that c set to 100%. +One or multiple PSM files can be provided at once. Note that merging PSMs from different MS runs +could have an impact on the correctness of the FDR control. Combining multiple PSM files should +generally only be done for LC-fractionated mass spectrometry runs. + +Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option. +Check the list of :py:mod:`psm_utils` tags in the +:external+psm_utils:ref:`supported file formats ` section. Depending on the +file extension, the file type can also be inferred from the file name. In that case, +``psm_file_type`` option can be set to ``infer``. + + Spectrum file(s) ================ diff --git a/docs/source/userguide/output-files.rst b/docs/source/userguide/output-files.rst index ee1a78f8..89c0363b 100644 --- a/docs/source/userguide/output-files.rst +++ b/docs/source/userguide/output-files.rst @@ -52,8 +52,8 @@ Rescoring engine files: | ``..weights.txt`` | Feature weights, showing feature usage in the rescoring run | +-------------------------------------------------------------+-------------------------------------------------------------+ -If no rescoring engine is selected (or if Percolator was selected), the following files will also -be written: +If no rescoring engine is selected, if Percolator was selected, or in DEBUG mode, the following +files will also be written: +-------------------------------------------------------------+-----------------------------------------------------------+ | File | Description | diff --git a/docs/source/userguide/tims2Rescore.rst b/docs/source/userguide/tims2Rescore.rst new file mode 100644 index 00000000..f9b9096f --- /dev/null +++ b/docs/source/userguide/tims2Rescore.rst @@ -0,0 +1,61 @@ +.. 
_timsrescore: + +TIMS²Rescore User Guide +======================= + +Introduction +------------ + +The `TIMS²Rescore` tool is a DDA-PASEF adapted version of `ms2rescore` that allows users to perform rescoring of peptide-spectrum matches (PSMs) acquired on Bruker instruments. This guide provides an overview of how to use `timsrescore` in `ms2rescore` effectively. + +Installation +------------ + +Before using `timsrescore`, ensure that you have `ms2rescore` installed on your system. You can install `ms2rescore` using the following command: + +.. code-block:: bash + + pip install ms2rescore + +Usage +----- + +To use `timsrescore`, follow these steps: + +1. Prepare your input files: + - Ensure that you have the necessary input files, including the PSM file spectrum files + - Make sure that the PSM file format from a supported search engine or a standard format like .mzid(:external+psm_utils:ref:`supported file formats `). + - Spectrum files can directly be given as .d or minitdf files from Bruker instruments or first converted to .mzML format. + +2. Run `timsrescore`: + - Open a terminal or command prompt. + - Navigate to the directory where your input files are located. + - Execute the following command: + + .. code-block:: bash + + timsrescore -p -s -o + + Replace ``, ``, and `` with the actual paths to your input and output files. + _NOTE_ By default timsTOF specific models will be used for predictions. Optionally you can further configure settings through a configuration file. For more information on configuring `timsrescore`, refer to the :doc:`configuration` tab in the user guide. + +3. Review the results: + - Once the `timsrescore` process completes, you will find the rescoring results in the specified output file or if not specified in the same directory as the input files + - If you want a detailed overview of the performance, you can either give the set `write_report` to `True` in the configuration file, use the `--write_report` option in the command line or run the following command: + + .. code-block:: bash + + ms2rescore-report + + Replace `` with the actual output prefix of the result files to the output file. + +Additional Options +------------------ + +`ms2rescore` provides additional options to customize the `timsrescore` process. You can explore these options by running the following command: + +.. 
code-block:: bash + + timsrescore --help + + diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 509fc99e..606a1c8a 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -1,6 +1,6 @@ """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs.""" -__version__ = "3.0.2" +__version__ = "3.1.0-dev9" from warnings import filterwarnings diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 4cac9122..70e384c2 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -1,6 +1,9 @@ """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs.""" import argparse +import cProfile +import importlib.resources +import json import logging import sys from pathlib import Path @@ -10,7 +13,7 @@ from rich.logging import RichHandler from rich.text import Text -from ms2rescore import __version__ +from ms2rescore import __version__, package_data from ms2rescore.config_parser import parse_configurations from ms2rescore.core import rescore from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -33,19 +36,26 @@ CONSOLE = Console(record=True) -def _print_credits(): +def _print_credits(tims=False): """Print software credits to terminal.""" text = Text() text.append("\n") - text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore") + if tims: + text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore") + else: + text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore") text.append(f" (v{__version__})\n", style="bold") + if tims: + text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic") text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n") text.append("Please cite: ") text.append( - "Declercq et al. MCP (2022)", style="link https://doi.org/10.1016/j.mcpro.2022.100266" + "Buur & Declercq et al. 
JPR (2024)", + style="link https://doi.org/10.1021/acs.jproteome.3c00785", ) text.append("\n") - text.stylize("cyan") + if tims: + text.stylize("#006cb5") CONSOLE.print(text) @@ -130,6 +140,21 @@ def _argument_parser() -> argparse.ArgumentParser: dest="fasta_file", help="path to FASTA file", ) + parser.add_argument( + "--write-report", + # metavar="BOOL", + action="store_true", + dest="write_report", + help="boolean to enable profiling with cProfile", + ) + parser.add_argument( + "--profile", + # metavar="BOOL", + action="store_true", + # type=bool, + # dest="profile", + help="boolean to enable profiling with cProfile", + ) return parser @@ -152,18 +177,42 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]): ) -def main(): +def profile(fnc, filepath): + """A decorator that uses cProfile to profile a function""" + + def inner(*args, **kwargs): + with cProfile.Profile() as profiler: + return_value = fnc(*args, **kwargs) + profiler.dump_stats(filepath + ".profile.prof") + return return_value + + return inner + + +def main_tims(): + """Run MS²Rescore command-line interface in TIMS²Rescore mode.""" + main(tims=True) + + +def main(tims=False): """Run MS²Rescore command-line interface.""" - _print_credits() + _print_credits(tims) # Parse CLI arguments and configuration file parser = _argument_parser() cli_args = parser.parse_args() + + configurations = [] + if cli_args.config_file: + configurations.append(cli_args.config_file) + if tims: + configurations.append( + json.load(importlib.resources.open_text(package_data, "config_default_tims.json")) + ) + configurations.append(cli_args) + try: - if cli_args.config_file: - config = parse_configurations([cli_args.config_file, cli_args]) - else: - config = parse_configurations(cli_args) + config = parse_configurations(configurations) except MS2RescoreConfigurationError as e: LOGGER.critical(e) sys.exit(1) @@ -175,7 +224,11 @@ def main(): # Run MS²Rescore try: - rescore(configuration=config) + if cli_args.profile: + profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"]) + profiled_rescore(configuration=config) + else: + rescore(configuration=config) except Exception as e: LOGGER.exception(e) sys.exit(1) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index f777d620..170f1038 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -3,15 +3,18 @@ from multiprocessing import cpu_count from typing import Dict, Optional +import numpy as np import psm_utils.io +from mokapot.dataset import LinearPsmDataset from psm_utils import PSMList +from ms2rescore import exceptions from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore import exceptions +from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence logger = logging.getLogger(__name__) @@ -44,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: psm_list = parse_psms(config, psm_list) # Log #PSMs identified before rescoring - id_psms_before = _log_id_psms_before(psm_list) + id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"]) # Define feature names; get existing feature names from PSM file feature_names = dict() @@ -58,12 +61,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following 
rescoring features: {psm_list_feature_names}" ) - # TODO: avoid hard coding feature generators in some way - rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] - im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # Add missing precursor info from spectrum file if needed + psm_list = _fill_missing_precursor_info(psm_list, config) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -107,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logging.debug(f"Creating USIs for {len(psm_list)} PSMs") psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list] - # If no rescoring engine is specified, write PSMs and features to PIN file - if not config["rescoring_engine"]: + # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file + if not config["rescoring_engine"] or config["log_level"] == "debug": logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin") psm_utils.io.write_file( psm_list, @@ -116,35 +115,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: filetype="percolator", feature_names=all_feature_names, ) + + if not config["rescoring_engine"]: + logger.info("No rescoring engine specified. Skipping rescoring.") return None # Rescore PSMs - if "percolator" in config["rescoring_engine"]: - percolator.rescore( - psm_list, - output_file_root=output_file_root, - log_level=config["log_level"], - processes=config["processes"], - percolator_kwargs=config["rescoring_engine"]["percolator"], - ) - elif "mokapot" in config["rescoring_engine"]: - if "fasta_file" not in config["rescoring_engine"]["mokapot"]: - config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] - if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: - protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") - else: - protein_kwargs = dict() - - mokapot.rescore( - psm_list, - output_file_root=output_file_root, - protein_kwargs=protein_kwargs, - **config["rescoring_engine"]["mokapot"], - ) - else: - logger.info("No known rescoring engine specified. 
Skipping rescoring.") + try: + if "percolator" in config["rescoring_engine"]: + percolator.rescore( + psm_list, + output_file_root=output_file_root, + log_level=config["log_level"], + processes=config["processes"], + percolator_kwargs=config["rescoring_engine"]["percolator"], + ) + elif "mokapot" in config["rescoring_engine"]: + if "fasta_file" not in config["rescoring_engine"]["mokapot"]: + config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] + if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: + protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") + else: + protein_kwargs = dict() + + mokapot.rescore( + psm_list, + output_file_root=output_file_root, + protein_kwargs=protein_kwargs, + **config["rescoring_engine"]["mokapot"], + ) + except exceptions.RescoringError as e: + # Write output + logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") + psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") - _log_id_psms_after(psm_list, id_psms_before) + # Reraise exception + raise e + + # Post-rescoring processing + if all(psm_list["pep"] == 1.0): + psm_list = _fix_constant_pep(psm_list) + psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False) + psm_list = _calculate_confidence(psm_list) + _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"]) # Write output logger.info(f"Writing output to {output_file_root}.psms.tsv...") @@ -157,7 +170,60 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True ) except exceptions.ReportGenerationError as e: - logger.error(e) + logger.exception(e) + + +def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList: + """Fill missing precursor info from spectrum file if needed.""" + # Check if required + # TODO: avoid hard coding feature generators in some way + rt_required = ("deeplc" in config["feature_generators"]) and any( + v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"] + ) + im_required = ( + "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] + ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"]) + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + + # Add missing values + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required) + + # Check if values are now present + for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]: + if required and ( + 0.0 in psm_list[value_name] + or None in psm_list[value_name] + or np.isnan(psm_list[value_name]).any() + ): + if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]): + raise exceptions.MissingValuesError( + f"Could not find any '{value_name}' values in PSM or spectrum files. Disable " + f"feature generators that require '{value_name}' or ensure that the values are " + "present in the input files." + ) + else: + missing_value_psms = psm_list[ + [v is None or np.isnan(v) for v in psm_list[value_name]] + ] + logger.warning( + f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. " + "These PSMs will be removed." 
+ ) + psm_list = psm_list[ + [v is not None and not np.isnan(v) for v in psm_list[value_name]] + ] + + return psm_list + + +def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList: + """Filter PSMs by rank.""" + psm_list.set_ranks(lower_score_better=lower_score_better) + rank_filter = psm_list["rank"] <= max_rank + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {max_rank}.") + return psm_list[rank_filter] def _write_feature_names(feature_names, output_file_root): @@ -169,25 +235,79 @@ def _write_feature_names(feature_names, output_file_root): f.write(f"{fgen}\t{feature}\n") -def _log_id_psms_before(psm_list): +def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() - logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before) + logger.info( + f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " + "rescoring." + ) return id_psms_before -def _log_id_psms_after(psm_list, id_psms_before): +def _log_id_psms_after( + psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1 +) -> int: """Log #PSMs identified after rescoring.""" id_psms_after = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() diff = id_psms_after - id_psms_before diff_perc = diff / id_psms_before if id_psms_before > 0 else None diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff) diff_word = "more" if diff > 0 else "less" - logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.") + logger.info( + f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after " + "rescoring." + ) return id_psms_after + + +def _fix_constant_pep(psm_list: PSMList) -> PSMList: + """Workaround for broken PEP calculation if best PSM is decoy.""" + logger.warning( + "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the " + "best target PSM." + ) + max_target_score = psm_list["score"][~psm_list["is_decoy"]].max() + higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score) + + if not higher_scoring_decoys.any(): + logger.warning("No decoys scoring higher than the best target found. Skipping fix.") + else: + psm_list = psm_list[~higher_scoring_decoys] + logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.") + + return psm_list + + +def _calculate_confidence(psm_list: PSMList) -> PSMList: + """ + Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList. 
+ """ + # Minimal conversion to LinearPsmDataset + psm_df = psm_list.to_dataframe() + psm_df = psm_df.reset_index(drop=True).reset_index() + psm_df["peptide"] = ( + psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) + ) + psm_df["is_target"] = ~psm_df["is_decoy"] + lin_psm_data = LinearPsmDataset( + psms=psm_df[["index", "peptide", "is_target"]], + target_column="is_target", + spectrum_columns="index", # Use artificial index to allow multi-rank rescoring + peptide_column="peptide", + ) + + # Recalculate confidence + new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"]) + + # Add new confidence estimations to PSMList + add_psm_confidence(psm_list, new_confidence) + add_peptide_confidence(psm_list, new_confidence) + + return psm_list diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index f2b06323..ba5492a2 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -25,7 +25,19 @@ class ModificationParsingError(IDFileParsingError): pass +class MissingValuesError(MS2RescoreError): + """Missing values in PSMs and/or spectra.""" + + pass + + class ReportGenerationError(MS2RescoreError): """Error while generating report.""" pass + + +class RescoringError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 89603237..3882b3fc 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -53,6 +53,7 @@ def __init__( ms2_tolerance: float = 0.02, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", + model_dir: Optional[str] = None, processes: 1, **kwargs, ) -> None: @@ -71,6 +72,8 @@ def __init__( spectrum_id_pattern : str, optional Regular expression pattern to extract spectrum ID from spectrum file. Defaults to :py:const:`.*`. + model_dir + Directory containing MS²PIP models. Defaults to :py:const:`None` (use MS²PIP default). processes : int, optional Number of processes to use. Defaults to 1. 
@@ -85,6 +88,7 @@ def __init__( self.ms2_tolerance = ms2_tolerance self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern + self.model_dir = model_dir self.processes = processes @property @@ -189,11 +193,12 @@ def add_features(self, psm_list: PSMList) -> None: try: ms2pip_results = correlate( psms=psm_list_run, - spectrum_file=spectrum_filename, + spectrum_file=str(spectrum_filename), spectrum_id_pattern=self.spectrum_id_pattern, model=self.model, ms2_tolerance=self.ms2_tolerance, compute_correlations=False, + model_dir=self.model_dir, processes=self.processes, ) except NoMatchingSpectraFound as e: diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 71f704b5..4e318da5 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -14,6 +14,7 @@ }, "rescoring_engine": { "mokapot": { + "train_fdr": 0.01, "write_weights": true, "write_txt": true, "write_flashlfq": true @@ -30,11 +31,13 @@ "psm_id_pattern": null, "spectrum_id_pattern": null, "lower_score_is_better": false, + "max_psm_rank_input": 10, + "max_psm_rank_output": 1, "modification_mapping": {}, "fixed_modifications": {}, "processes": -1, "rename_to_usi": false, "fasta_file": null, - "write_report": true + "write_report": false } } diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json new file mode 100644 index 00000000..2a77adf1 --- /dev/null +++ b/ms2rescore/package_data/config_default_tims.json @@ -0,0 +1,25 @@ +{ + "$schema": "./config_schema.json", + "ms2rescore": { + "feature_generators": { + "basic": {}, + "ms2pip": { + "model": "timsTOF", + "ms2_tolerance": 0.02 + }, + "deeplc": { + "deeplc_retrain": false + }, + "im2deep": {}, + "maxquant": {} + }, + "rescoring_engine": { + "mokapot": { + "write_weights": true, + "write_txt": true, + "write_flashlfq": true + } + }, + "psm_file": null + } +} diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index a97c9a59..75b9284e 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -65,7 +65,11 @@ }, "psm_file": { "description": "Path to file with peptide-spectrum matches.", - "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }] + "oneOf": [ + { "type": "string" }, + { "type": "null" }, + { "type": "array", "items": { "type": "string" } } + ] }, "psm_file_type": { "description": "PSM file type. 
By default inferred from file extension.", @@ -112,6 +116,18 @@ "type": "boolean", "default": false }, + "max_psm_rank_input": { + "description": "Maximum rank of PSMs to use as input for rescoring", + "type": "number", + "default": 10, + "minimum": 1 + }, + "max_psm_rank_output": { + "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation", + "type": "number", + "default": 1, + "minimum": 1 + }, "modification_mapping": { "description": "Mapping of modification labels to each replacement label.", "type": "object", @@ -142,6 +158,11 @@ "description": "Write an HTML report with various QC metrics and charts", "type": "boolean", "default": false + }, + "profile": { + "description": "Write a txt report using cProfile for profiling", + "type": "boolean", + "default": false } } } @@ -230,6 +251,13 @@ "type": "object", "additionalProperties": true, "properties": { + "train_fdr": { + "description": "FDR threshold for training Mokapot", + "type": "number", + "minimum": 0, + "maximum": 1, + "default": 0.01 + }, "write_weights": { "description": "Write Mokapot weights to a text file", "type": "boolean", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index b30539f3..3514210e 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,8 +1,8 @@ import logging import re -from itertools import chain -from typing import Dict, Union +from typing import Dict, Optional, Union +import numpy as np import psm_utils.io from psm_utils import PSMList @@ -24,10 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. """ - # Read PSMs, find decoys, calculate q-values - psm_list = _read_psms(config, psm_list) - _find_decoys(config, psm_list) - _calculate_qvalues(config, psm_list) + # Read PSMs + try: + psm_list = _read_psms(config, psm_list) + except psm_utils.io.PSMUtilsIOException: + raise MS2RescoreConfigurationError( + "Error occurred while reading PSMs. Please check the 'psm_file' and " + "'psm_file_type' settings. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" + " for more information." 
+ ) + + # Filter by PSM rank + psm_list.set_ranks(config["lower_score_is_better"]) + rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] + psm_list = psm_list[rank_filter] + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.") + + # Remove invalid AAs, find decoys, calculate q-values + psm_list = _remove_invalid_aa(psm_list) + _find_decoys(psm_list, config["id_decoy_pattern"]) + _calculate_qvalues(psm_list, config["lower_score_is_better"]) + if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: + logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") + _parse_values_from_spectrum_id(config, psm_list) # Store scoring values for comparison later for psm in psm_list: @@ -60,17 +80,13 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if config["psm_id_pattern"]: pattern = re.compile(config["psm_id_pattern"]) - logger.debug("Applying `psm_id_pattern`...") + logger.debug("Applying 'psm_id_pattern'...") logger.debug( - f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`" + f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'" ) new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids - # TODO: Temporary fix until implemented in psm_utils - # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str) - psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]] - return psm_list @@ -78,49 +94,30 @@ def _read_psms(config, psm_list): if isinstance(psm_list, PSMList): return psm_list else: - logger.info("Reading PSMs from file...") - current_file = 1 total_files = len(config["psm_file"]) - valid_psms_list = [] - total_psms = 0 - valid_psms = 0 - for psm_file in config["psm_file"]: + psm_list = [] + for current_file, psm_file in enumerate(config["psm_file"]): logger.info( - f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..." + f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." ) - try: - id_file_psm_list = psm_utils.io.read_file( + psm_list.extend( + psm_utils.io.read_file( psm_file, filetype=config["psm_file_type"], show_progressbar=True, **config["psm_reader_kwargs"], ) - except psm_utils.io.PSMUtilsIOException: - raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the `psm_file` and " - "`psm_file_type` settings. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" - " for more information." - ) - - total_psms += len(id_file_psm_list.psm_list) - for psm in id_file_psm_list.psm_list: - if not _has_invalid_aminoacids(psm): - valid_psms_list.append(psm) - valid_psms += 1 - current_file += 1 - if total_psms - valid_psms > 0: - logger.warning( - f"{total_psms - valid_psms} PSMs with invalid amino acids were removed." 
) - return PSMList(psm_list=valid_psms_list) + logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.") + return PSMList(psm_list=psm_list) -def _find_decoys(config, psm_list): + +def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None): """Find decoys in PSMs, log amount, and raise error if none found.""" logger.debug("Finding decoys...") - if config["id_decoy_pattern"]: - psm_list.find_decoys(config["id_decoy_pattern"]) + if id_decoy_pattern: + psm_list.find_decoys(id_decoy_pattern) n_psms = len(psm_list) percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100 @@ -129,18 +126,18 @@ def _find_decoys(config, psm_list): if not any(psm_list["is_decoy"]): raise MS2RescoreConfigurationError( "No decoy PSMs found. Please check if decoys are present in the PSM file and that " - "the `id_decoy_pattern` option is correct. See " + "the 'id_decoy_pattern' option is correct. See " "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms" " for more information." ) -def _calculate_qvalues(config, psm_list): +def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool): """Calculate q-values for PSMs if not present.""" # Calculate q-values if not present if None in psm_list["qvalue"]: logger.debug("Recalculating q-values...") - psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"]) + psm_list.calculate_qvalues(reverse=not lower_score_is_better) def _match_psm_ids(old_id, regex_pattern): @@ -150,12 +147,50 @@ def _match_psm_ids(old_id, regex_pattern): return match[1] except (TypeError, IndexError): raise MS2RescoreConfigurationError( - f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})." + f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})." " Ensure that the regex contains a capturing group?" ) -def _has_invalid_aminoacids(psm): - """Check if a PSM contains invalid amino acids.""" +def _parse_values_from_spectrum_id( + psm_list: PSMList, + psm_id_rt_pattern: Optional[str] = None, + psm_id_im_pattern: Optional[str] = None, +): + """Parse retention time and or ion mobility values from the spectrum_id.""" + for pattern, label, key in zip( + [psm_id_rt_pattern, psm_id_im_pattern], + ["retention time", "ion mobility"], + ["retention_time", "ion_mobility"], + ): + if pattern: + logger.debug( + f"Parsing {label} from spectrum_id with regex pattern " f"{psm_id_rt_pattern}" + ) + try: + pattern = re.compile(pattern) + psm_list[key] = [ + float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list + ] + except AttributeError: + raise MS2RescoreConfigurationError( + f"Could not parse {label} from spectrum_id with the " + f"{pattern} regex pattern. " + f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. " + f"Please make sure the {label} key is present in the spectrum_id " + "and the value is in a capturing group or disable the relevant feature generator." 
+ ) + + +def _remove_invalid_aa(psm_list: PSMList) -> PSMList: + """Remove PSMs with invalid amino acids.""" + invalid_psms = np.array( + [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list] + ) - return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence) + if any(invalid_psms): + logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.") + return psm_list[~invalid_psms] + else: + logger.debug("No PSMs with invalid amino acids found.") + return psm_list diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 9ed199b9..2b2c1b5f 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -3,12 +3,9 @@ import logging import re from itertools import chain -from typing import Dict, Tuple +from ms2rescore_rs import get_precursor_info from psm_utils import PSMList -from pyteomics.mgf import MGF -from pyteomics.mzml import MzML -from rich.progress import track from ms2rescore.exceptions import MS2RescoreError from ms2rescore.utils import infer_spectrum_path @@ -16,122 +13,40 @@ logger = logging.getLogger(__name__) -def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): +def get_missing_values( + psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False +): """Get missing RT/IM features from spectrum file.""" - logger.debug("Extracting missing RT/IM values from spectrum file(s).") - psm_dict = psm_list.get_psm_dict() for runs in psm_dict.values(): - for run, psms in track(runs.items(), description="Extracting RT/IM values..."): + for run, psms in runs.items(): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_file = infer_spectrum_path(config["spectrum_path"], run) - - if spectrum_file.suffix.lower() == ".mzml": - rt_dict, im_dict = _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im - ) - elif spectrum_file.suffix.lower() == ".mgf": - rt_dict, im_dict = _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im - ) - - for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]): - if value_dict: - try: - psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run] - except KeyError: - raise ParsingError( - f"Could not parse {value} values from spectrum file for run {run}." - ) - - -def _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im -) -> Tuple[Dict, Dict]: - """ - Parse retention time and/or ion mobility from an MGF file. - - Notes - ----- - - Extracting values (e.g., ion mobility) according to the Matrix documentation: - http://www.matrixscience.com/help/data_file_help.html - - """ - rt_dict = {} - im_dict = {} - - spectrum_id_pattern = re.compile( - config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" - ) - - for spectrum in MGF(str(spectrum_file)): - matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`rtinseconds`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." 
- ) - if missing_im: - try: - im_dict[matched_id] = float(spectrum["params"]["ion_mobility"]) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`ion_mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." - ) - - return rt_dict, im_dict - - -def _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im -) -> Tuple[Dict, Dict]: - """Parse retention time and/or ion mobility from an mzML file.""" - rt_dict = {} - im_dict = {} - - spectrum_id_pattern = re.compile( - config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" - ) - - for spectrum in MzML(str(spectrum_file)): - matched_id = spectrum_id_pattern.match(spectrum["id"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`scan start time`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." - ) - if missing_im: - try: - im_dict[matched_id] = float( - spectrum["scanList"]["scan"][0]["reverse ion mobility"] - ) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`reverse ion mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." - ) - - return rt_dict, im_dict - - -class ParseMGFError(MS2RescoreError): - """Error parsing MGF file.""" - - pass - - -class ParsingError(MS2RescoreError): + logger.debug("Reading spectrum file: '%s'", spectrum_file) + precursors = get_precursor_info(str(spectrum_file)) + + if config["spectrum_id_pattern"]: + spectrum_id_pattern = re.compile(config["spectrum_id_pattern"]) + precursors = { + spectrum_id_pattern.search(spectrum_id).group(1): precursor + for spectrum_id, precursor in precursors.items() + } + + for psm in psm_list_run: + try: + if rt_required: + psm.retention_time = precursors[psm.spectrum_id].rt + if im_required: + psm.ion_mobility = precursors[psm.spectrum_id].im + if not psm.precursor_mz: + psm.precursor_mz = precursors[psm.spectrum_id].mz + except KeyError as e: + raise SpectrumParsingError( + f"Could not extract missing RT/IM values from spectrum file for run {run}." + ) from e + + +class SpectrumParsingError(MS2RescoreError): """Error parsing retention time from spectrum file.""" pass diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py index e601d826..669bcc62 100644 --- a/ms2rescore/report/charts.py +++ b/ms2rescore/report/charts.py @@ -198,6 +198,7 @@ def score_scatter_plot( after: mokapot.LinearConfidence, level: str = "psms", indexer: str = "index", + fdr_threshold: float = 0.01, ) -> go.Figure: """ Plot PSM scores before and after rescoring. @@ -214,6 +215,14 @@ def score_scatter_plot( Column with index for each PSM, peptide, or protein to use for merging data frames. 
""" + if not before or not after: + figure = go.Figure() + figure.add_annotation( + text="No data available for comparison.", + showarrow=False, + ) + return figure + # Restructure data merge_columns = [indexer, "mokapot score", "mokapot q-value", "mokapot PEP"] ce_psms_targets = pd.merge( @@ -233,16 +242,22 @@ def score_scatter_plot( ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0) # Get score thresholds - score_threshold_before = ( - ce_psms[ce_psms["mokapot q-value before"] <= 0.01] - .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] - .iloc[0] - ) - score_threshold_after = ( - ce_psms[ce_psms["mokapot q-value after"] <= 0.01] - .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] - .iloc[0] - ) + try: + score_threshold_before = ( + ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold] + .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_before = None + try: + score_threshold_after = ( + ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold] + .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_after = None # Plot fig = px.scatter( @@ -259,10 +274,12 @@ def score_scatter_plot( }, ) # draw FDR thresholds - fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) - fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) + if score_threshold_before: + fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) + fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) + if score_threshold_after: + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) return fig @@ -288,6 +305,14 @@ def fdr_plot_comparison( Column with index for each PSM, peptide, or protein to use for merging dataframes. """ + if not before or not after: + figure = go.Figure() + figure.add_annotation( + text="No data available for comparison.", + showarrow=False, + ) + return figure + # Prepare data ce_psms_targets_melted = ( pd.merge( @@ -336,7 +361,7 @@ def fdr_plot_comparison( def identification_overlap( before: mokapot.LinearConfidence, after: mokapot.LinearConfidence, -) -> go.Figure(): +) -> go.Figure: """ Plot stacked bar charts of removed, retained, and gained PSMs, peptides, and proteins. @@ -348,8 +373,16 @@ def identification_overlap( Mokapot linear confidence results after rescoring. 
""" + if not before or not after: + figure = go.Figure() + figure.add_annotation( + text="No data available for comparison.", + showarrow=False, + ) + return figure + levels = before.levels # ["psms", "peptides", "proteins"] if all available - indexers = ["index", "index", "mokapot protein group"] + indexers = ["index", "peptide", "mokapot protein group"] overlap_data = defaultdict(dict) for level, indexer in zip(levels, indexers): @@ -362,7 +395,7 @@ def identification_overlap( set_after = set(df_after[df_after["mokapot q-value"] <= 0.01][indexer]) overlap_data["removed"][level] = -len(set_before - set_after) - overlap_data["retained"][level] = len(set_before | set_after) + overlap_data["retained"][level] = len(set_after.intersection(set_before)) overlap_data["gained"][level] = len(set_after - set_before) colors = ["#953331", "#316395", "#319545"] diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index 090db873..0ac092af 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -82,7 +82,7 @@ def generate_report( # Read config config = json.loads(files["configuration"].read_text()) - logger.info("Recalculating confidence estimates...") + logger.debug("Recalculating confidence estimates...") fasta_file = config["ms2rescore"]["fasta_file"] confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file) @@ -139,7 +139,7 @@ def generate_report( def _collect_files(output_path_prefix, use_txt_log=False): """Collect all files generated by MS²Rescore.""" - logger.info("Collecting files...") + logger.debug("Collecting files...") files = { "PSMs": Path(output_path_prefix + ".psms.tsv").resolve(), "configuration": Path(output_path_prefix + ".full-config.json").resolve(), @@ -151,7 +151,7 @@ def _collect_files(output_path_prefix, use_txt_log=False): } for file, path in files.items(): if Path(path).is_file(): - logger.info("✅ Found %s: '%s'", file, path.as_posix()) + logger.debug("✅ Found %s: '%s'", file, path.as_posix()) else: logger.warning("❌ %s: '%s'", file, path.as_posix()) files[file] = None @@ -164,6 +164,11 @@ def _get_stats_context(confidence_before, confidence_after): levels = ["psms", "peptides", "proteins"] level_names = ["PSMs", "Peptides", "Protein groups"] card_colors = ["card-bg-blue", "card-bg-green", "card-bg-red"] + + # Cannot report stats if confidence estimates are not present + if not confidence_before or not confidence_after: + return stats + for level, level_name, card_color in zip(levels, level_names, card_colors): try: before = confidence_before.accepted[level.lower()] @@ -178,7 +183,7 @@ def _get_stats_context(confidence_before, confidence_after): "item": level_name, "card_color": card_color, "number": after, - "diff": f"{after - before:+}", + "diff": f"({after - before:+})", "percentage": f"{increase:.1f}%", "is_increase": increase > 0, "bar_percentage": before / after * 100 if increase > 0 else after / before * 100, diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py index e8568e59..069a641f 100644 --- a/ms2rescore/report/utils.py +++ b/ms2rescore/report/utils.py @@ -51,39 +51,26 @@ def get_confidence_estimates( "was generated by MS²Rescore. Could not generate report." 
) from e + score_after = psm_list["score"] peptide = ( pd.Series(psm_list["peptidoform"]).astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) ) - psms = pd.DataFrame( - { - "peptide": peptide, - "is_target": ~psm_list["is_decoy"], - "before": score_before, - "after": psm_list["score"], - } - ).reset_index() - + psms = pd.DataFrame({"peptide": peptide, "is_target": ~psm_list["is_decoy"]}).reset_index() + lin_psm_dataset = LinearPsmDataset( + psms=psms, + target_column="is_target", + spectrum_columns="index", + peptide_column="peptide", + ) if fasta_file: fasta = read_fasta(fasta_file) + lin_psm_dataset.add_proteins(fasta) confidence = dict() - for when in ["before", "after"]: - lin_psm_dataset = LinearPsmDataset( - psms=psms, - target_column="is_target", - spectrum_columns="index", - feature_columns=[when], - peptide_column="peptide", - ) - if fasta_file: - lin_psm_dataset.add_proteins(fasta) - + for when, scores in [("before", score_before), ("after", score_after)]: try: - confidence[when] = lin_psm_dataset.assign_confidence() - except RuntimeError as e: - raise ReportGenerationError( - f"Error while assigning confidence estimates to PSMs ({when} rescoring). " - "Could not generate report." - ) from e + confidence[when] = lin_psm_dataset.assign_confidence(scores=scores) + except RuntimeError: + confidence[when] = None return confidence["before"], confidence["after"] diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index f3927f47..f02d877f 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -20,7 +20,8 @@ """ import logging -from typing import Any, List, Optional, Tuple, Dict +import re +from typing import Any, Dict, List, Optional, Tuple import mokapot import numpy as np @@ -28,15 +29,20 @@ import psm_utils from mokapot.brew import brew from mokapot.dataset import LinearPsmDataset +from mokapot.model import PercolatorModel from pyteomics.mass import nist_mass +from ms2rescore.exceptions import RescoringError + logger = logging.getLogger(__name__) +logging.getLogger("numba").setLevel(logging.WARNING) def rescore( psm_list: psm_utils.PSMList, output_file_root: str = "ms2rescore", fasta_file: Optional[str] = None, + train_fdr: float = 0.01, write_weights: bool = False, write_txt: bool = False, write_flashlfq: bool = False, @@ -63,6 +69,8 @@ def rescore( fasta_file Path to FASTA file with protein sequences to use for protein inference. Defaults to ``None``. + train_fdr + FDR to use for training the Mokapot model. Defaults to ``0.01``. write_weights Write model weights to a text file. Defaults to ``False``. 
write_txt
@@ -89,27 +97,15 @@ def rescore(

     # Rescore
     logger.debug(f"Mokapot brew options: `{kwargs}`")
-    confidence_results, models = brew(lin_psm_data, **kwargs)
-
-    # Reshape confidence estimates to match PSMList
-    mokapot_values_targets = (
-        confidence_results.confidence_estimates["psms"]
-        .set_index("index")
-        .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
-    )
-    mokapot_values_decoys = (
-        confidence_results.decoy_confidence_estimates["psms"]
-        .set_index("index")
-        .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
-    )
-    q = np.full((len(psm_list), 3), np.nan)
-    q[mokapot_values_targets.index] = mokapot_values_targets.values
-    q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+    try:
+        confidence_results, models = brew(
+            lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
+        )
+    except RuntimeError as e:
+        raise RescoringError("Mokapot could not be run. Please check the input data.") from e

-    # Add Mokapot results to PSMList
-    psm_list["score"] = q[:, 0]
-    psm_list["qvalue"] = q[:, 1]
-    psm_list["pep"] = q[:, 2]
+    add_psm_confidence(psm_list, confidence_results)
+    add_peptide_confidence(psm_list, confidence_results)

     # Write results
     if write_weights:
@@ -171,6 +167,10 @@ def convert_psm_list(
         feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
         combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)

+    # Ensure filename for FlashLFQ txt output
+    if not combined_df["run"].notnull().all():
+        combined_df["run"] = "na"
+
     feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None

     lin_psm_data = LinearPsmDataset(
@@ -220,6 +220,58 @@ def save_model_weights(
     )


+def add_psm_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """Add PSM-level confidence estimates to PSM list, updating score, qvalue, pep, and rank."""
+    # Reshape confidence estimates to match PSMList
+    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+    mokapot_values_targets = (
+        confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
+    )
+    mokapot_values_decoys = (
+        confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
+    )
+    q = np.full((len(psm_list), 3), np.nan)
+    q[mokapot_values_targets.index] = mokapot_values_targets.values
+    q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+
+    # Add Mokapot results to PSMList
+    psm_list["score"] = q[:, 0]
+    psm_list["qvalue"] = q[:, 1]
+    psm_list["pep"] = q[:, 2]
+
+    # Reset ranks to match new scores
+    psm_list.set_ranks(lower_score_better=False)
+
+
+def add_peptide_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """Add Mokapot peptide-level confidence estimates to PSM list."""
+    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+    peptide_info = pd.concat(
+        [
+            confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys],
+            confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys],
+        ],
+        axis=0,
+    ).to_dict(orient="index")
+
+    # Add peptide-level scores to PSM metadata
+    # run_key = "na" if not all(psm.run for psm in psm_list) else None
+    no_charge_pattern = re.compile(r"(/\d+$)")
+    for psm in psm_list:
+        peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
+        psm.metadata.update(
+            {
+                "peptide_score": peptide_scores["mokapot score"],
+                "peptide_qvalue": peptide_scores["mokapot q-value"],
+                "peptide_pep": peptide_scores["mokapot PEP"],
+            }
+        )
+
+
 def _mz_to_mass(mz: float, charge: int) -> float:
     """Convert m/z to mass."""
     return mz * charge - charge * nist_mass["H"][1][0]
diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py
index 5f7d4e5d..9952aed5 100644
--- a/ms2rescore/rescoring_engines/percolator.py
+++ b/ms2rescore/rescoring_engines/percolator.py
@@ -20,8 +20,8 @@ import logging
 import subprocess
 from typing import Any, Dict, Optional
+from copy import deepcopy

-import numpy as np
 import psm_utils

 from ms2rescore.exceptions import MS2RescoreError

@@ -103,8 +103,15 @@ def rescore(
     # Need to be able to link back to original PSMs, so reindex spectrum IDs, but copy PSM list
     # to avoid modifying original...
     # TODO: Better approach for this?
-    psm_list_reindexed = psm_list.copy()
-    psm_list_reindexed["spectrum_id"] = np.arange(len(psm_list_reindexed))
+
+    psm_list_reindexed = deepcopy(psm_list)
+    psm_list_reindexed.set_ranks()
+    psm_list_reindexed["spectrum_id"] = [
+        f"{psm.get_usi(as_url=False)}_{psm.rank}" for psm in psm_list_reindexed
+    ]
+    spectrum_id_index = {
+        spectrum_id: index for index, spectrum_id in enumerate(psm_list_reindexed["spectrum_id"])
+    }

     _write_pin_file(psm_list_reindexed, pin_filepath)

@@ -134,10 +141,13 @@ def rescore(
         psm_list,
         percolator_kwargs["results-psms"],
         percolator_kwargs["decoy-results-psms"],
+        spectrum_id_index,
     )


-def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str):
+def _update_psm_scores(
+    psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str, spectrum_id_index: dict
+):
     """
     Update PSM scores with Percolator results.

@@ -150,7 +160,9 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
     psm_list_percolator = psm_utils.PSMList(psm_list=target_psms.psm_list + decoy_psms.psm_list)

     # Sort by reindexed spectrum_id so order matches original PSM list
-    psm_list_percolator[np.argsort(psm_list_percolator["spectrum_id"])]
+    psm_list_percolator = sorted(
+        psm_list_percolator, key=lambda psm: spectrum_id_index[psm["spectrum_id"]]
+    )

     if not len(psm_list) == len(psm_list_percolator):
         raise MS2RescoreError(
@@ -163,6 +175,8 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
         original_psm["qvalue"] = new_psm["qvalue"]
         original_psm["pep"] = new_psm["pep"]

+    psm_list.set_ranks(lower_score_better=False)
+

 def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str):
     """Write PIN file for rescoring."""
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 70417b56..3515893a 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -35,44 +35,61 @@ def infer_spectrum_path(
                 "and no run name in PSM file found."
             )

-    # If passed path is directory, join with run name
-    elif os.path.isdir(configured_path):
-        if run_name:
-            resolved_path = os.path.join(configured_path, run_name)
+    else:
+        is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path)
+
+        # If passed path is directory (that is not Bruker raw), join with run name
+        if os.path.isdir(configured_path) and not is_bruker_dir:
+            if run_name:
+                resolved_path = os.path.join(configured_path, run_name)
+            else:
+                raise MS2RescoreConfigurationError(
+                    "Could not resolve spectrum file name: Spectrum path is directory "
+                    "but no run name in PSM file found."
+                )
+
+        # If passed path is file, use that, but warn if basename doesn't match expected
+        elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir):
+            if run_name and Path(configured_path).stem != Path(run_name).stem:
+                logger.warning(
+                    "Passed spectrum path (`%s`) does not match run name found in PSM "
+                    "file (`%s`). Continuing with passed spectrum path.",
+                    configured_path,
+                    run_name,
+                )
+            resolved_path = configured_path
         else:
             raise MS2RescoreConfigurationError(
-                "Could not resolve spectrum file name: Spectrum path is directory "
-                "but no run name in PSM file found."
-            )
-
-    # If passed path is file, use that, but warn if basename doesn't match expected
-    elif os.path.isfile(configured_path):
-        if run_name and Path(configured_path).stem != Path(run_name).stem:
-            logger.warning(
-                "Passed spectrum path (`%s`) does not match run name found in PSM "
-                "file (`%s`). Continuing with passed spectrum path.",
-                configured_path,
-                run_name,
+                "Configured `spectrum_path` must be `None` or a path to an existing file "
+                "or directory. If `None` or path to directory, spectrum run information "
+                "should be present in the PSM file."
             )
-        resolved_path = configured_path
-    else:
-        raise MS2RescoreConfigurationError(
-            "Configured `spectrum_path` must be `None` or a path to an existing file "
-            "or directory. If `None` or path to directory, spectrum run information "
-            "should be present in the PSM file."
-        )

     # Match with file extension if not in resolved_path yet
-    if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE):
+    if not _is_minitdf(resolved_path) and not re.match(
+        r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE
+    ):
         for filename in glob(resolved_path + "*"):
-            if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE):
+            if re.match(r".*(\.mgf$|\.mzml$|\.d)", filename, flags=re.IGNORECASE):
                 resolved_path = filename
                 break
         else:
             raise MS2RescoreConfigurationError(
-                "Resolved spectrum filename does not contain a supported file "
-                "extension (mgf or mzml) and could not find any matching existing "
+                f"Resolved spectrum filename ('{resolved_path}') does not contain a supported "
+                "file extension (mzML, MGF, or .d) and could not find any matching existing "
                 "files."
             )

     return Path(resolved_path)
+
+
+def _is_minitdf(spectrum_file: str) -> bool:
+    """
+    Check if the spectrum file is a Bruker miniTDF folder.
+
+    A Bruker miniTDF folder has no fixed name, but contains files matching the patterns
+    ``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``.
+    """
+    files = set(Path(spectrum_file).glob("*ms2spectrum.bin"))
+    files.update(Path(spectrum_file).glob("*ms2spectrum.parquet"))
+    return len(files) >= 2
diff --git a/pyproject.toml b/pyproject.toml
index 8f135149..694fa068 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,24 +32,24 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.8"
 dependencies = [
-    "numpy>=1.16.0; python_version != '3.11'",
-    "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
- "pandas>=1.0", - "rich>=12", - "pyteomics>=4.1.0", - "lxml>=4.5", - "ms2pip>=4.0.0-dev4", - "click>=7", "cascade-config>=0.4.0", + "click>=7", + "customtkinter>=5,<6", "deeplc>=2.2", "deeplcretrainer>=0.2", - "tomli>=2; python_version < '3.11'", - "psm_utils>=0.4", - "customtkinter>=5,<6", - "mokapot>=0.9", - "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils + "im2deep>=0.1.3", "jinja2>=3", + "lxml>=4.5", + "mokapot>=0.9", + "ms2pip>=4.0.0-dev10", + "ms2rescore_rs", + "numpy>=1.16.0", + "pandas>=1.0", "plotly>=5", + "psm_utils>=0.9", + "pyteomics>=4.7.2", + "rich>=12", + "tomli>=2; python_version < '3.11'", ] [project.optional-dependencies] @@ -78,6 +78,7 @@ CompOmics = "https://www.compomics.com" ms2rescore = "ms2rescore.__main__:main" ms2rescore-gui = "ms2rescore.gui.__main__:main" ms2rescore-report = "ms2rescore.report.__main__:main" +tims2rescore = "ms2rescore.__main__:main_tims" [build-system] requires = ["flit_core >=3.2,<4"] diff --git a/tests/test_data/test.mgf b/tests/test_data/test.mgf new file mode 100644 index 00000000..e4899c08 --- /dev/null +++ b/tests/test_data/test.mgf @@ -0,0 +1,13 @@ +BEGIN IONS +TITLE=peptide: peptide1 +CHARGE=2+ +PEPMASS=475.137295 +ION_MOBILITY=42.42 +RTINSECONDS=51.2 +72.04439 100 +148.06043 600 +232.07504 300 +263.08737 400 +347.10198 500 +423.11802 200 +END IONS diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py new file mode 100644 index 00000000..4dffc9dc --- /dev/null +++ b/tests/test_parse_spectra.py @@ -0,0 +1,23 @@ +import pytest +from psm_utils import PSM, PSMList + +from ms2rescore.parse_spectra import get_missing_values + + +def test_get_missing_values(): + psm_list = PSMList( + psm_list=[ + PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"), + ] + ) + get_missing_values( + psm_list, + config={ + "spectrum_path": "tests/test_data/test.mgf", + "spectrum_id_pattern": "peptide: (.*)", + }, + rt_required=True, + im_required=True, + ) + assert psm_list[0].retention_time == pytest.approx(0.853, 0.001) + assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01)