diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c03b1667..7172c303 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -29,7 +29,7 @@ jobs:
- name: Test built package
run: |
- pip install dist/ms2rescore-*.whl
+ pip install --only-binary :all: dist/ms2rescore-*.whl
# pytest
ms2rescore --help
@@ -54,7 +54,7 @@ jobs:
- name: Install package and dependencies
run: |
python -m pip install --upgrade pip
- pip install . pyinstaller
+ pip install --only-binary :all: . pyinstaller
- name: Install Inno Setup
uses: crazy-max/ghaction-chocolatey@v3
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 339aade3..b2643eec 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -35,11 +35,12 @@ jobs:
- name: Build and install ms2rescore package
run: |
- pip install .[dev]
+ pip install --only-binary :all: .[dev]
+
+ - name: Test with pytest
+ run: |
+ pytest
- # - name: Test with pytest
- # run: |
- # pytest
- name: Test installation
run: |
ms2rescore --help
diff --git a/Dockerfile b/Dockerfile
index 474f6f5d..847f6a4a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,6 @@
-FROM ubuntu:focal
+FROM python:3.11
+
+# ARG DEBIAN_FRONTEND=noninteractive
LABEL name="ms2rescore"
@@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in
ADD ms2rescore /ms2rescore/ms2rescore
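+# Install from prebuilt wheels only (--only-binary :all:) to avoid compiling dependencies in the image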
RUN apt-get update \
- && apt-get install --no-install-recommends -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \
- && rm -rf /var/lib/apt/lists/* \
- && pip3 install ms2rescore/
+ && apt install -y procps \
+ && pip install /ms2rescore --only-binary :all:
ENTRYPOINT [""]
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
index 2b523aa5..f145421c 100644
--- a/docs/source/config_schema.md
+++ b/docs/source/config_schema.md
@@ -48,6 +48,8 @@
- *string*
- *null*
- **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`.
+ - **`max_psm_rank_input`** *(number)*: Maximum rank of PSMs to use as input for rescoring. Minimum: `1`. Default: `10`.
+ - **`max_psm_rank_output`** *(number)*: Maximum rank of PSMs to return after rescoring, before final FDR calculation. Minimum: `1`. Default: `1`.
- **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`.
- **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`.
- **`processes`** *(number)*: Number of parallel processes to use; -1 for all available. Minimum: `-1`. Default: `-1`.
@@ -57,6 +59,7 @@
- *string*
- *null*
- **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`.
+ - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`.
## Definitions
- **`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties.
@@ -76,6 +79,7 @@
- **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`.
- **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
+ - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
- **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`.
diff --git a/docs/source/userguide/configuration.rst b/docs/source/userguide/configuration.rst
index 52036aa7..e22b3ec1 100644
--- a/docs/source/userguide/configuration.rst
+++ b/docs/source/userguide/configuration.rst
@@ -123,7 +123,7 @@ be configured separately. For instance:
.. code-block:: json
"fixed_modifications": {
- "C": "U:Carbamidomethyl"
+ "U:Carbamidomethyl": ["C"]
}
.. tab:: TOML
@@ -131,7 +131,7 @@ be configured separately. For instance:
.. code-block:: toml
[ms2rescore.fixed_modifications]
- "Carbamidomethyl" = ["C"]
+ "U:Carbamidomethyl" = ["C"]
.. tab:: GUI
@@ -140,6 +140,28 @@ be configured separately. For instance:
:alt: fixed modifications configuration in GUI
+Fixed terminal modifications can be added by using the special labels ``N-term`` and ``C-term``.
+For example, to additionally add TMT6plex to the N-terminus and lysine residues, the following
+configuration can be used:
+
+.. tab:: JSON
+
+ .. code-block:: json
+
+ "fixed_modifications": {
+ "U:Carbamidomethyl": ["C"],
+ "U:TMT6plex": ["N-term", "K"]
+ }
+
+.. tab:: TOML
+
+ .. code-block:: toml
+
+ [ms2rescore.fixed_modifications]
+ "U:Carbamidomethyl" = ["C"]
+ "U:TMT6plex" = ["N-term", "K"]
+
+
.. caution::
Most search engines DO return fixed modifications as part of the modified peptide sequences.
In these cases, they must NOT be added to the ``fixed_modifications`` configuration.
@@ -218,6 +240,65 @@ expression pattern that extracts the decoy status from the protein name:
decoy_pattern = "DECOY_"
+Multi-rank rescoring
+====================
+
+Some search engines, such as MaxQuant, report multiple candidate PSMs for the same spectrum.
+MS²Rescore can rescore all of these candidates, allowing a lower-ranking candidate PSM to become
+the top-ranked PSM after rescoring. The number of candidates that are considered is controlled
+with the ``max_psm_rank_input`` option.
+
+To ensure correct FDR control after rescoring, MS²Rescore filters out lower-ranking PSMs before
+the final FDR calculation and before writing the output files. To retain lower-ranking PSMs in
+the final output - for instance, to consider chimeric spectra - the ``max_psm_rank_output``
+option can be increased.
+
+For example, to rescore the top 5 PSMs per spectrum and output the best PSM after rescoring,
+the following configuration can be used:
+
+.. tab:: JSON
+
+ .. code-block:: json
+
+ "max_psm_rank_input": 5
+ "max_psm_rank_output": 1
+
+.. tab:: TOML
+
+ .. code-block:: toml
+
+ max_psm_rank_input = 5
+ max_psm_rank_output = 1
+
+
+Configuring rescoring engines
+=============================
+
+MS²Rescore supports multiple rescoring engines, such as Mokapot and Percolator. The rescoring
+engine can be selected and configured with the ``rescoring_engine`` option. For example, to use
+Mokapot with a custom ``train_fdr`` of 0.1%, the following configuration can be used:
+
+.. tab:: JSON
+
+ .. code-block:: json
+
+ "rescoring_engine": {
+ "mokapot": {
+ "train_fdr": 0.001
+ }
+
+.. tab:: TOML
+
+ .. code-block:: toml
+
+ [ms2rescore.rescoring_engine.mokapot]
+ train_fdr = 0.001
+
+
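+Percolator can be selected in the same way. As a minimal sketch, an empty ``percolator`` section
+selects Percolator with its default settings:
+
+.. tab:: JSON
+
+   .. code-block:: json
+
+      "rescoring_engine": {
+          "percolator": {}
+      }
+
+.. tab:: TOML
+
+   .. code-block:: toml
+
+      [ms2rescore.rescoring_engine.percolator]
+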
+All options for the rescoring engines can be found in the :ref:`ms2rescore.rescoring_engines`
+section.
+
+
All configuration options
=========================
diff --git a/docs/source/userguide/input-files.rst b/docs/source/userguide/input-files.rst
index 08129b7f..b556b520 100644
--- a/docs/source/userguide/input-files.rst
+++ b/docs/source/userguide/input-files.rst
@@ -5,16 +5,13 @@ Input files
PSM file(s)
===========
-The peptide-spectrum match (PSM) file is generally the output from a proteomics search engine.
-This file serves as the main input to MS²Rescore. One or multiple PSM files can be provided at
-once. Note that merging PSMs from different MS runs could have an impact on the correctness of
-the FDR control.
+The **peptide-spectrum match (PSM) file** is generally the output from a proteomics search engine.
+This file serves as the main input to MS²Rescore.
-Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option.
-Check the list of :py:mod:`psm_utils` tags in the
-:external+psm_utils:ref:`supported file formats ` section. Depending on the
-file extension, the file type can also be inferred from the file name. In that case,
-``psm_file_type`` option can be set to ``infer``.
+The PSM file should contain **all putative identifications** made by the search engine, including
+both target and decoy PSMs. Ensure that the search engine was configured to include decoy entries
+in the search database and was operated with **target-decoy competition** enabled (i.e.,
+considering both target and decoy sequences simultaneously during the search).
.. attention::
As a general rule, MS²Rescore always needs access to **all target and decoy PSMs, without any
@@ -22,6 +19,17 @@ file extension, the file type can also be inferred from the file name. In that c
set to 100%.
+One or multiple PSM files can be provided at once. Note that merging PSMs from different MS runs
+could have an impact on the correctness of the FDR control. Combining multiple PSM files should
+generally only be done for LC-fractionated mass spectrometry runs.
+
+Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option.
+Check the list of :py:mod:`psm_utils` tags in the
+:external+psm_utils:ref:`supported file formats ` section. For many formats, the file type can
+also be inferred from the file extension. In that case, the ``psm_file_type`` option can be set
+to ``infer``.
+
+
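+For example, a minimal TOML configuration sketch that provides two PSM files (the file names are
+placeholders) and lets the file type be inferred could look like this:
+
+.. code-block:: toml
+
+   [ms2rescore]
+   psm_file = ["fraction_01.mzid", "fraction_02.mzid"]
+   psm_file_type = "infer"
+
+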
Spectrum file(s)
================
diff --git a/docs/source/userguide/output-files.rst b/docs/source/userguide/output-files.rst
index ee1a78f8..89c0363b 100644
--- a/docs/source/userguide/output-files.rst
+++ b/docs/source/userguide/output-files.rst
@@ -52,8 +52,8 @@ Rescoring engine files:
| ``..weights.txt`` | Feature weights, showing feature usage in the rescoring run |
+-------------------------------------------------------------+-------------------------------------------------------------+
-If no rescoring engine is selected (or if Percolator was selected), the following files will also
-be written:
+If no rescoring engine is selected, if Percolator was selected, or in DEBUG mode, the following
+files will also be written:
+-------------------------------------------------------------+-----------------------------------------------------------+
| File | Description |
diff --git a/docs/source/userguide/tims2Rescore.rst b/docs/source/userguide/tims2Rescore.rst
new file mode 100644
index 00000000..f9b9096f
--- /dev/null
+++ b/docs/source/userguide/tims2Rescore.rst
@@ -0,0 +1,61 @@
+.. _timsrescore:
+
+TIMS²Rescore User Guide
+=======================
+
+Introduction
+------------
+
+`TIMS²Rescore` is a DDA-PASEF-adapted version of `ms2rescore` for rescoring peptide-spectrum matches (PSMs) from data acquired on Bruker timsTOF instruments. This guide provides an overview of how to use `timsrescore` effectively.
+
+Installation
+------------
+
+Before using `timsrescore`, ensure that you have `ms2rescore` installed on your system. You can install `ms2rescore` using the following command:
+
+.. code-block:: bash
+
+ pip install ms2rescore
+
+Usage
+-----
+
+To use `timsrescore`, follow these steps:
+
+1. Prepare your input files:
+   - Ensure that you have the necessary input files, including the PSM file and spectrum files.
+   - Make sure that the PSM file comes from a supported search engine or is in a standard format such as .mzid (see the :external+psm_utils:ref:`supported file formats ` section).
+   - Spectrum files can be provided directly as .d or miniTDF files from Bruker instruments, or first converted to .mzML format.
+
+2. Run `timsrescore`:
+ - Open a terminal or command prompt.
+ - Navigate to the directory where your input files are located.
+ - Execute the following command:
+
+ .. code-block:: bash
+
+      timsrescore -p <psm_file> -s <spectrum_file> -o <output_prefix>
+
+   Replace ``<psm_file>``, ``<spectrum_file>``, and ``<output_prefix>`` with the actual paths to your input and output files.
+   **Note:** By default, timsTOF-specific models are used for predictions. Optionally, you can further configure settings through a configuration file (a minimal sketch is shown below). For more information on configuring `timsrescore`, refer to the :doc:`configuration` section of the user guide.
+
+3. Review the results:
+   - Once the `timsrescore` process completes, you will find the rescoring results in the specified output location or, if none was specified, in the same directory as the input files.
+   - If you want a detailed overview of the performance, you can either set `write_report` to `true` in the configuration file, use the `--write-report` command-line option, or run the following command:
+
+ .. code-block:: bash
+
+      ms2rescore-report <output_prefix>
+
+   Replace ``<output_prefix>`` with the output prefix of the result files.
+
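+Example configuration
+---------------------
+
+TIMS²Rescore already applies timsTOF-specific defaults, so a configuration file only needs to
+list the input files and any options you want to override. The following TOML file is a minimal
+sketch; the file names are placeholders:
+
+.. code-block:: toml
+
+   [ms2rescore]
+   psm_file = "identifications.mzid"
+   spectrum_path = "run01.d"
+   write_report = true
+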
+Additional Options
+------------------
+
+`ms2rescore` provides additional options to customize the `timsrescore` process. You can explore these options by running the following command:
+
+.. code-block:: bash
+
+ timsrescore --help
+
+
diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py
index 509fc99e..606a1c8a 100644
--- a/ms2rescore/__init__.py
+++ b/ms2rescore/__init__.py
@@ -1,6 +1,6 @@
"""MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
-__version__ = "3.0.2"
+__version__ = "3.1.0-dev9"
from warnings import filterwarnings
diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py
index 4cac9122..70e384c2 100644
--- a/ms2rescore/__main__.py
+++ b/ms2rescore/__main__.py
@@ -1,6 +1,9 @@
"""MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
import argparse
+import cProfile
+import importlib.resources
+import json
import logging
import sys
from pathlib import Path
@@ -10,7 +13,7 @@
from rich.logging import RichHandler
from rich.text import Text
-from ms2rescore import __version__
+from ms2rescore import __version__, package_data
from ms2rescore.config_parser import parse_configurations
from ms2rescore.core import rescore
from ms2rescore.exceptions import MS2RescoreConfigurationError
@@ -33,19 +36,26 @@
CONSOLE = Console(record=True)
-def _print_credits():
+def _print_credits(tims=False):
"""Print software credits to terminal."""
text = Text()
text.append("\n")
- text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+ if tims:
+ text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+ else:
+ text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
text.append(f" (v{__version__})\n", style="bold")
+ if tims:
+ text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic")
text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n")
text.append("Please cite: ")
text.append(
- "Declercq et al. MCP (2022)", style="link https://doi.org/10.1016/j.mcpro.2022.100266"
+ "Buur & Declercq et al. JPR (2024)",
+ style="link https://doi.org/10.1021/acs.jproteome.3c00785",
)
text.append("\n")
- text.stylize("cyan")
+ if tims:
+ text.stylize("#006cb5")
CONSOLE.print(text)
@@ -130,6 +140,21 @@ def _argument_parser() -> argparse.ArgumentParser:
dest="fasta_file",
help="path to FASTA file",
)
+    parser.add_argument(
+        "--write-report",
+        action="store_true",
+        dest="write_report",
+        help="write an HTML report with various QC metrics and charts",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="enable profiling of the rescoring run with cProfile",
+    )
return parser
@@ -152,18 +177,42 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]):
)
-def main():
+def profile(fnc, filepath):
+ """A decorator that uses cProfile to profile a function"""
+
+ def inner(*args, **kwargs):
+ with cProfile.Profile() as profiler:
+ return_value = fnc(*args, **kwargs)
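+        # Write profiling stats next to the other output files; inspect the .prof file with pstats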
+ profiler.dump_stats(filepath + ".profile.prof")
+ return return_value
+
+ return inner
+
+
+def main_tims():
+ """Run MS²Rescore command-line interface in TIMS²Rescore mode."""
+ main(tims=True)
+
+
+def main(tims=False):
"""Run MS²Rescore command-line interface."""
- _print_credits()
+ _print_credits(tims)
# Parse CLI arguments and configuration file
parser = _argument_parser()
cli_args = parser.parse_args()
+
+ configurations = []
+ if cli_args.config_file:
+ configurations.append(cli_args.config_file)
+ if tims:
+ configurations.append(
+ json.load(importlib.resources.open_text(package_data, "config_default_tims.json"))
+ )
+ configurations.append(cli_args)
+
try:
- if cli_args.config_file:
- config = parse_configurations([cli_args.config_file, cli_args])
- else:
- config = parse_configurations(cli_args)
+ config = parse_configurations(configurations)
except MS2RescoreConfigurationError as e:
LOGGER.critical(e)
sys.exit(1)
@@ -175,7 +224,11 @@ def main():
# Run MS²Rescore
try:
- rescore(configuration=config)
+ if cli_args.profile:
+ profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"])
+ profiled_rescore(configuration=config)
+ else:
+ rescore(configuration=config)
except Exception as e:
LOGGER.exception(e)
sys.exit(1)
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index f777d620..170f1038 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -3,15 +3,18 @@
from multiprocessing import cpu_count
from typing import Dict, Optional
+import numpy as np
import psm_utils.io
+from mokapot.dataset import LinearPsmDataset
from psm_utils import PSMList
+from ms2rescore import exceptions
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore import exceptions
+from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
logger = logging.getLogger(__name__)
@@ -44,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
psm_list = parse_psms(config, psm_list)
# Log #PSMs identified before rescoring
- id_psms_before = _log_id_psms_before(psm_list)
+ id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"])
# Define feature names; get existing feature names from PSM file
feature_names = dict()
@@ -58,12 +61,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)
- # TODO: avoid hard coding feature generators in some way
- rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
- im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
- if rt_required or im_required:
- logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
- get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+ # Add missing precursor info from spectrum file if needed
+ psm_list = _fill_missing_precursor_info(psm_list, config)
# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
@@ -107,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
- # If no rescoring engine is specified, write PSMs and features to PIN file
- if not config["rescoring_engine"]:
+ # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
+ if not config["rescoring_engine"] or config["log_level"] == "debug":
logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
psm_utils.io.write_file(
psm_list,
@@ -116,35 +115,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
filetype="percolator",
feature_names=all_feature_names,
)
+
+ if not config["rescoring_engine"]:
+ logger.info("No rescoring engine specified. Skipping rescoring.")
return None
# Rescore PSMs
- if "percolator" in config["rescoring_engine"]:
- percolator.rescore(
- psm_list,
- output_file_root=output_file_root,
- log_level=config["log_level"],
- processes=config["processes"],
- percolator_kwargs=config["rescoring_engine"]["percolator"],
- )
- elif "mokapot" in config["rescoring_engine"]:
- if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
- config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
- if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
- protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
- else:
- protein_kwargs = dict()
-
- mokapot.rescore(
- psm_list,
- output_file_root=output_file_root,
- protein_kwargs=protein_kwargs,
- **config["rescoring_engine"]["mokapot"],
- )
- else:
- logger.info("No known rescoring engine specified. Skipping rescoring.")
+ try:
+ if "percolator" in config["rescoring_engine"]:
+ percolator.rescore(
+ psm_list,
+ output_file_root=output_file_root,
+ log_level=config["log_level"],
+ processes=config["processes"],
+ percolator_kwargs=config["rescoring_engine"]["percolator"],
+ )
+ elif "mokapot" in config["rescoring_engine"]:
+ if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
+ config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+ if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+ protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+ else:
+ protein_kwargs = dict()
+
+ mokapot.rescore(
+ psm_list,
+ output_file_root=output_file_root,
+ protein_kwargs=protein_kwargs,
+ **config["rescoring_engine"]["mokapot"],
+ )
+ except exceptions.RescoringError as e:
+ # Write output
+ logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...")
+ psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
- _log_id_psms_after(psm_list, id_psms_before)
+ # Reraise exception
+ raise e
+
+ # Post-rescoring processing
+ if all(psm_list["pep"] == 1.0):
+ psm_list = _fix_constant_pep(psm_list)
+ psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False)
+ psm_list = _calculate_confidence(psm_list)
+ _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"])
# Write output
logger.info(f"Writing output to {output_file_root}.psms.tsv...")
@@ -157,7 +170,60 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
)
except exceptions.ReportGenerationError as e:
- logger.error(e)
+ logger.exception(e)
+
+
+def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
+ """Fill missing precursor info from spectrum file if needed."""
+ # Check if required
+ # TODO: avoid hard coding feature generators in some way
+ rt_required = ("deeplc" in config["feature_generators"]) and any(
+ v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
+ )
+ im_required = (
+ "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
+ ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
+ logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
+
+ # Add missing values
+ if rt_required or im_required:
+ logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+ get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
+
+ # Check if values are now present
+ for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]:
+ if required and (
+ 0.0 in psm_list[value_name]
+ or None in psm_list[value_name]
+ or np.isnan(psm_list[value_name]).any()
+ ):
+ if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
+ raise exceptions.MissingValuesError(
+ f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
+ f"feature generators that require '{value_name}' or ensure that the values are "
+ "present in the input files."
+ )
+ else:
+ missing_value_psms = psm_list[
+ [v is None or np.isnan(v) for v in psm_list[value_name]]
+ ]
+ logger.warning(
+ f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
+ "These PSMs will be removed."
+ )
+ psm_list = psm_list[
+ [v is not None and not np.isnan(v) for v in psm_list[value_name]]
+ ]
+
+ return psm_list
+
+
+def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
+ """Filter PSMs by rank."""
+ psm_list.set_ranks(lower_score_better=lower_score_better)
+ rank_filter = psm_list["rank"] <= max_rank
+ logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {max_rank}.")
+ return psm_list[rank_filter]
def _write_feature_names(feature_names, output_file_root):
@@ -169,25 +235,79 @@ def _write_feature_names(feature_names, output_file_root):
f.write(f"{fgen}\t{feature}\n")
-def _log_id_psms_before(psm_list):
+def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
"""Log #PSMs identified before rescoring."""
id_psms_before = (
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
).sum()
- logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
+ logger.info(
+ f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
+ "rescoring."
+ )
return id_psms_before
-def _log_id_psms_after(psm_list, id_psms_before):
+def _log_id_psms_after(
+ psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1
+) -> int:
"""Log #PSMs identified after rescoring."""
id_psms_after = (
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
).sum()
diff = id_psms_after - id_psms_before
diff_perc = diff / id_psms_before if id_psms_before > 0 else None
diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
diff_word = "more" if diff > 0 else "less"
- logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
+ logger.info(
+ f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after "
+ "rescoring."
+ )
return id_psms_after
+
+
+def _fix_constant_pep(psm_list: PSMList) -> PSMList:
+ """Workaround for broken PEP calculation if best PSM is decoy."""
+ logger.warning(
+ "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
+ "best target PSM."
+ )
+ max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
+ higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
+
+ if not higher_scoring_decoys.any():
+ logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
+ else:
+ psm_list = psm_list[~higher_scoring_decoys]
+ logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.")
+
+ return psm_list
+
+
+def _calculate_confidence(psm_list: PSMList) -> PSMList:
+ """
+ Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList.
+ """
+ # Minimal conversion to LinearPsmDataset
+ psm_df = psm_list.to_dataframe()
+ psm_df = psm_df.reset_index(drop=True).reset_index()
+ psm_df["peptide"] = (
+ psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
+ )
+ psm_df["is_target"] = ~psm_df["is_decoy"]
+ lin_psm_data = LinearPsmDataset(
+ psms=psm_df[["index", "peptide", "is_target"]],
+ target_column="is_target",
+ spectrum_columns="index", # Use artificial index to allow multi-rank rescoring
+ peptide_column="peptide",
+ )
+
+ # Recalculate confidence
+ new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+
+ # Add new confidence estimations to PSMList
+ add_psm_confidence(psm_list, new_confidence)
+ add_peptide_confidence(psm_list, new_confidence)
+
+ return psm_list
diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
index f2b06323..ba5492a2 100644
--- a/ms2rescore/exceptions.py
+++ b/ms2rescore/exceptions.py
@@ -25,7 +25,19 @@ class ModificationParsingError(IDFileParsingError):
pass
+class MissingValuesError(MS2RescoreError):
+ """Missing values in PSMs and/or spectra."""
+
+ pass
+
+
class ReportGenerationError(MS2RescoreError):
"""Error while generating report."""
pass
+
+
+class RescoringError(MS2RescoreError):
+ """Error while rescoring PSMs."""
+
+ pass
diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py
index 89603237..3882b3fc 100644
--- a/ms2rescore/feature_generators/ms2pip.py
+++ b/ms2rescore/feature_generators/ms2pip.py
@@ -53,6 +53,7 @@ def __init__(
ms2_tolerance: float = 0.02,
spectrum_path: Optional[str] = None,
spectrum_id_pattern: str = "(.*)",
+ model_dir: Optional[str] = None,
processes: 1,
**kwargs,
) -> None:
@@ -71,6 +72,8 @@ def __init__(
spectrum_id_pattern : str, optional
Regular expression pattern to extract spectrum ID from spectrum file. Defaults to
:py:const:`.*`.
+ model_dir
+ Directory containing MS²PIP models. Defaults to :py:const:`None` (use MS²PIP default).
processes : int, optional
Number of processes to use. Defaults to 1.
@@ -85,6 +88,7 @@ def __init__(
self.ms2_tolerance = ms2_tolerance
self.spectrum_path = spectrum_path
self.spectrum_id_pattern = spectrum_id_pattern
+ self.model_dir = model_dir
self.processes = processes
@property
@@ -189,11 +193,12 @@ def add_features(self, psm_list: PSMList) -> None:
try:
ms2pip_results = correlate(
psms=psm_list_run,
- spectrum_file=spectrum_filename,
+ spectrum_file=str(spectrum_filename),
spectrum_id_pattern=self.spectrum_id_pattern,
model=self.model,
ms2_tolerance=self.ms2_tolerance,
compute_correlations=False,
+ model_dir=self.model_dir,
processes=self.processes,
)
except NoMatchingSpectraFound as e:
diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json
index 71f704b5..4e318da5 100644
--- a/ms2rescore/package_data/config_default.json
+++ b/ms2rescore/package_data/config_default.json
@@ -14,6 +14,7 @@
},
"rescoring_engine": {
"mokapot": {
+ "train_fdr": 0.01,
"write_weights": true,
"write_txt": true,
"write_flashlfq": true
@@ -30,11 +31,13 @@
"psm_id_pattern": null,
"spectrum_id_pattern": null,
"lower_score_is_better": false,
+ "max_psm_rank_input": 10,
+ "max_psm_rank_output": 1,
"modification_mapping": {},
"fixed_modifications": {},
"processes": -1,
"rename_to_usi": false,
"fasta_file": null,
- "write_report": true
+ "write_report": false
}
}
diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json
new file mode 100644
index 00000000..2a77adf1
--- /dev/null
+++ b/ms2rescore/package_data/config_default_tims.json
@@ -0,0 +1,25 @@
+{
+ "$schema": "./config_schema.json",
+ "ms2rescore": {
+ "feature_generators": {
+ "basic": {},
+ "ms2pip": {
+ "model": "timsTOF",
+ "ms2_tolerance": 0.02
+ },
+ "deeplc": {
+ "deeplc_retrain": false
+ },
+ "im2deep": {},
+ "maxquant": {}
+ },
+ "rescoring_engine": {
+ "mokapot": {
+ "write_weights": true,
+ "write_txt": true,
+ "write_flashlfq": true
+ }
+ },
+ "psm_file": null
+ }
+}
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index a97c9a59..75b9284e 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -65,7 +65,11 @@
},
"psm_file": {
"description": "Path to file with peptide-spectrum matches.",
- "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
+ "oneOf": [
+ { "type": "string" },
+ { "type": "null" },
+ { "type": "array", "items": { "type": "string" } }
+ ]
},
"psm_file_type": {
"description": "PSM file type. By default inferred from file extension.",
@@ -112,6 +116,18 @@
"type": "boolean",
"default": false
},
+ "max_psm_rank_input": {
+ "description": "Maximum rank of PSMs to use as input for rescoring",
+ "type": "number",
+ "default": 10,
+ "minimum": 1
+ },
+ "max_psm_rank_output": {
+ "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation",
+ "type": "number",
+ "default": 1,
+ "minimum": 1
+ },
"modification_mapping": {
"description": "Mapping of modification labels to each replacement label.",
"type": "object",
@@ -142,6 +158,11 @@
"description": "Write an HTML report with various QC metrics and charts",
"type": "boolean",
"default": false
+ },
+ "profile": {
+ "description": "Write a txt report using cProfile for profiling",
+ "type": "boolean",
+ "default": false
}
}
}
@@ -230,6 +251,13 @@
"type": "object",
"additionalProperties": true,
"properties": {
+ "train_fdr": {
+ "description": "FDR threshold for training Mokapot",
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1,
+ "default": 0.01
+ },
"write_weights": {
"description": "Write Mokapot weights to a text file",
"type": "boolean",
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index b30539f3..3514210e 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -1,8 +1,8 @@
import logging
import re
-from itertools import chain
-from typing import Dict, Union
+from typing import Dict, Optional, Union
+import numpy as np
import psm_utils.io
from psm_utils import PSMList
@@ -24,10 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
"""
- # Read PSMs, find decoys, calculate q-values
- psm_list = _read_psms(config, psm_list)
- _find_decoys(config, psm_list)
- _calculate_qvalues(config, psm_list)
+ # Read PSMs
+ try:
+ psm_list = _read_psms(config, psm_list)
+ except psm_utils.io.PSMUtilsIOException:
+ raise MS2RescoreConfigurationError(
+ "Error occurred while reading PSMs. Please check the 'psm_file' and "
+ "'psm_file_type' settings. See "
+ "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
+ " for more information."
+ )
+
+ # Filter by PSM rank
+ psm_list.set_ranks(config["lower_score_is_better"])
+ rank_filter = psm_list["rank"] <= config["max_psm_rank_input"]
+ psm_list = psm_list[rank_filter]
+ logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.")
+
+ # Remove invalid AAs, find decoys, calculate q-values
+ psm_list = _remove_invalid_aa(psm_list)
+ _find_decoys(psm_list, config["id_decoy_pattern"])
+ _calculate_qvalues(psm_list, config["lower_score_is_better"])
+ if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
+ logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
+        _parse_values_from_spectrum_id(
+            psm_list, config["psm_id_rt_pattern"], config["psm_id_im_pattern"]
+        )
# Store scoring values for comparison later
for psm in psm_list:
@@ -60,17 +80,13 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
- logger.debug("Applying `psm_id_pattern`...")
+ logger.debug("Applying 'psm_id_pattern'...")
logger.debug(
- f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
+ f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
)
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
- # TODO: Temporary fix until implemented in psm_utils
- # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
- psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
-
return psm_list
@@ -78,49 +94,30 @@ def _read_psms(config, psm_list):
if isinstance(psm_list, PSMList):
return psm_list
else:
- logger.info("Reading PSMs from file...")
- current_file = 1
total_files = len(config["psm_file"])
- valid_psms_list = []
- total_psms = 0
- valid_psms = 0
- for psm_file in config["psm_file"]:
+ psm_list = []
+ for current_file, psm_file in enumerate(config["psm_file"]):
logger.info(
- f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
+ f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..."
)
- try:
- id_file_psm_list = psm_utils.io.read_file(
+ psm_list.extend(
+ psm_utils.io.read_file(
psm_file,
filetype=config["psm_file_type"],
show_progressbar=True,
**config["psm_reader_kwargs"],
)
- except psm_utils.io.PSMUtilsIOException:
- raise MS2RescoreConfigurationError(
- "Error occurred while reading PSMs. Please check the `psm_file` and "
- "`psm_file_type` settings. See "
- "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
- " for more information."
- )
-
- total_psms += len(id_file_psm_list.psm_list)
- for psm in id_file_psm_list.psm_list:
- if not _has_invalid_aminoacids(psm):
- valid_psms_list.append(psm)
- valid_psms += 1
- current_file += 1
- if total_psms - valid_psms > 0:
- logger.warning(
- f"{total_psms - valid_psms} PSMs with invalid amino acids were removed."
)
- return PSMList(psm_list=valid_psms_list)
+ logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.")
+ return PSMList(psm_list=psm_list)
-def _find_decoys(config, psm_list):
+
+def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None):
"""Find decoys in PSMs, log amount, and raise error if none found."""
logger.debug("Finding decoys...")
- if config["id_decoy_pattern"]:
- psm_list.find_decoys(config["id_decoy_pattern"])
+ if id_decoy_pattern:
+ psm_list.find_decoys(id_decoy_pattern)
n_psms = len(psm_list)
percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
@@ -129,18 +126,18 @@ def _find_decoys(config, psm_list):
if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
- "the `id_decoy_pattern` option is correct. See "
+ "the 'id_decoy_pattern' option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)
-def _calculate_qvalues(config, psm_list):
+def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool):
"""Calculate q-values for PSMs if not present."""
# Calculate q-values if not present
if None in psm_list["qvalue"]:
logger.debug("Recalculating q-values...")
- psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])
+ psm_list.calculate_qvalues(reverse=not lower_score_is_better)
def _match_psm_ids(old_id, regex_pattern):
@@ -150,12 +147,50 @@ def _match_psm_ids(old_id, regex_pattern):
return match[1]
except (TypeError, IndexError):
raise MS2RescoreConfigurationError(
- f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
+ f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
" Ensure that the regex contains a capturing group?"
)
-def _has_invalid_aminoacids(psm):
- """Check if a PSM contains invalid amino acids."""
+def _parse_values_from_spectrum_id(
+ psm_list: PSMList,
+ psm_id_rt_pattern: Optional[str] = None,
+ psm_id_im_pattern: Optional[str] = None,
+):
+ """Parse retention time and or ion mobility values from the spectrum_id."""
+ for pattern, label, key in zip(
+ [psm_id_rt_pattern, psm_id_im_pattern],
+ ["retention time", "ion mobility"],
+ ["retention_time", "ion_mobility"],
+ ):
+ if pattern:
+ logger.debug(
+ f"Parsing {label} from spectrum_id with regex pattern " f"{psm_id_rt_pattern}"
+ )
+ try:
+ pattern = re.compile(pattern)
+ psm_list[key] = [
+ float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list
+ ]
+ except AttributeError:
+ raise MS2RescoreConfigurationError(
+ f"Could not parse {label} from spectrum_id with the "
+ f"{pattern} regex pattern. "
+ f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. "
+ f"Please make sure the {label} key is present in the spectrum_id "
+ "and the value is in a capturing group or disable the relevant feature generator."
+ )
+
+
+def _remove_invalid_aa(psm_list: PSMList) -> PSMList:
+ """Remove PSMs with invalid amino acids."""
+ invalid_psms = np.array(
+ [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list]
+ )
- return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence)
+ if any(invalid_psms):
+ logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.")
+ return psm_list[~invalid_psms]
+ else:
+ logger.debug("No PSMs with invalid amino acids found.")
+ return psm_list
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index 9ed199b9..2b2c1b5f 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -3,12 +3,9 @@
import logging
import re
from itertools import chain
-from typing import Dict, Tuple
+from ms2rescore_rs import get_precursor_info
from psm_utils import PSMList
-from pyteomics.mgf import MGF
-from pyteomics.mzml import MzML
-from rich.progress import track
from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.utils import infer_spectrum_path
@@ -16,122 +13,40 @@
logger = logging.getLogger(__name__)
-def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+def get_missing_values(
+ psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
+):
"""Get missing RT/IM features from spectrum file."""
- logger.debug("Extracting missing RT/IM values from spectrum file(s).")
-
psm_dict = psm_list.get_psm_dict()
for runs in psm_dict.values():
- for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
+ for run, psms in runs.items():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
-
- if spectrum_file.suffix.lower() == ".mzml":
- rt_dict, im_dict = _parse_values_from_mzml(
- spectrum_file, config, run, missing_rt, missing_im
- )
- elif spectrum_file.suffix.lower() == ".mgf":
- rt_dict, im_dict = _parse_values_from_mgf(
- spectrum_file, config, run, missing_rt, missing_im
- )
-
- for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
- if value_dict:
- try:
- psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
- except KeyError:
- raise ParsingError(
- f"Could not parse {value} values from spectrum file for run {run}."
- )
-
-
-def _parse_values_from_mgf(
- spectrum_file, config, run, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
- """
- Parse retention time and/or ion mobility from an MGF file.
-
- Notes
- -----
- - Extracting values (e.g., ion mobility) according to the Matrix documentation:
- http://www.matrixscience.com/help/data_file_help.html
-
- """
- rt_dict = {}
- im_dict = {}
-
- spectrum_id_pattern = re.compile(
- config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
- )
-
- for spectrum in MGF(str(spectrum_file)):
- matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
- if missing_rt:
- try:
- rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
- except KeyError:
- raise ParsingError(
- "Could not parse retention time (`rtinseconds`) from spectrum file for "
- f"run {run}. Please make sure that the retention time key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
- if missing_im:
- try:
- im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
- except KeyError:
- raise ParsingError(
- "Could not parse ion mobility (`ion_mobility`) from spectrum file "
- f"for run {run}. Please make sure that the ion mobility key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
-
- return rt_dict, im_dict
-
-
-def _parse_values_from_mzml(
- spectrum_file, config, run, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
- """Parse retention time and/or ion mobility from an mzML file."""
- rt_dict = {}
- im_dict = {}
-
- spectrum_id_pattern = re.compile(
- config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
- )
-
- for spectrum in MzML(str(spectrum_file)):
- matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
- if missing_rt:
- try:
- rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
- except KeyError:
- raise ParsingError(
- "Could not parse retention time (`scan start time`) from spectrum file for "
- f"run {run}. Please make sure that the retention time key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
- if missing_im:
- try:
- im_dict[matched_id] = float(
- spectrum["scanList"]["scan"][0]["reverse ion mobility"]
- )
- except KeyError:
- raise ParsingError(
- "Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
- f"for run {run}. Please make sure that the ion mobility key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
-
- return rt_dict, im_dict
-
-
-class ParseMGFError(MS2RescoreError):
- """Error parsing MGF file."""
-
- pass
-
-
-class ParsingError(MS2RescoreError):
+ logger.debug("Reading spectrum file: '%s'", spectrum_file)
+ precursors = get_precursor_info(str(spectrum_file))
+
+ if config["spectrum_id_pattern"]:
+ spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
+ precursors = {
+ spectrum_id_pattern.search(spectrum_id).group(1): precursor
+ for spectrum_id, precursor in precursors.items()
+ }
+
+ for psm in psm_list_run:
+ try:
+ if rt_required:
+ psm.retention_time = precursors[psm.spectrum_id].rt
+ if im_required:
+ psm.ion_mobility = precursors[psm.spectrum_id].im
+ if not psm.precursor_mz:
+ psm.precursor_mz = precursors[psm.spectrum_id].mz
+ except KeyError as e:
+ raise SpectrumParsingError(
+ f"Could not extract missing RT/IM values from spectrum file for run {run}."
+ ) from e
+
+
+class SpectrumParsingError(MS2RescoreError):
"""Error parsing retention time from spectrum file."""
pass
diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py
index e601d826..669bcc62 100644
--- a/ms2rescore/report/charts.py
+++ b/ms2rescore/report/charts.py
@@ -198,6 +198,7 @@ def score_scatter_plot(
after: mokapot.LinearConfidence,
level: str = "psms",
indexer: str = "index",
+ fdr_threshold: float = 0.01,
) -> go.Figure:
"""
Plot PSM scores before and after rescoring.
@@ -214,6 +215,14 @@ def score_scatter_plot(
Column with index for each PSM, peptide, or protein to use for merging data frames.
"""
+ if not before or not after:
+ figure = go.Figure()
+ figure.add_annotation(
+ text="No data available for comparison.",
+ showarrow=False,
+ )
+ return figure
+
# Restructure data
merge_columns = [indexer, "mokapot score", "mokapot q-value", "mokapot PEP"]
ce_psms_targets = pd.merge(
@@ -233,16 +242,22 @@ def score_scatter_plot(
ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0)
# Get score thresholds
- score_threshold_before = (
- ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
- .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
- .iloc[0]
- )
- score_threshold_after = (
- ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
- .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
- .iloc[0]
- )
+ try:
+ score_threshold_before = (
+ ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
+ .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
+ .iloc[0]
+ )
+ except IndexError: # No PSMs below threshold
+ score_threshold_before = None
+ try:
+ score_threshold_after = (
+ ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
+ .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
+ .iloc[0]
+ )
+ except IndexError: # No PSMs below threshold
+ score_threshold_after = None
# Plot
fig = px.scatter(
@@ -259,10 +274,12 @@ def score_scatter_plot(
},
)
# draw FDR thresholds
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
+ if score_threshold_before:
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
+ if score_threshold_after:
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
return fig
@@ -288,6 +305,14 @@ def fdr_plot_comparison(
Column with index for each PSM, peptide, or protein to use for merging dataframes.
"""
+ if not before or not after:
+ figure = go.Figure()
+ figure.add_annotation(
+ text="No data available for comparison.",
+ showarrow=False,
+ )
+ return figure
+
# Prepare data
ce_psms_targets_melted = (
pd.merge(
@@ -336,7 +361,7 @@ def fdr_plot_comparison(
def identification_overlap(
before: mokapot.LinearConfidence,
after: mokapot.LinearConfidence,
-) -> go.Figure():
+) -> go.Figure:
"""
Plot stacked bar charts of removed, retained, and gained PSMs, peptides, and proteins.
@@ -348,8 +373,16 @@ def identification_overlap(
Mokapot linear confidence results after rescoring.
"""
+ if not before or not after:
+ figure = go.Figure()
+ figure.add_annotation(
+ text="No data available for comparison.",
+ showarrow=False,
+ )
+ return figure
+
levels = before.levels # ["psms", "peptides", "proteins"] if all available
- indexers = ["index", "index", "mokapot protein group"]
+ indexers = ["index", "peptide", "mokapot protein group"]
overlap_data = defaultdict(dict)
for level, indexer in zip(levels, indexers):
@@ -362,7 +395,7 @@ def identification_overlap(
set_after = set(df_after[df_after["mokapot q-value"] <= 0.01][indexer])
overlap_data["removed"][level] = -len(set_before - set_after)
- overlap_data["retained"][level] = len(set_before | set_after)
+ overlap_data["retained"][level] = len(set_after.intersection(set_before))
overlap_data["gained"][level] = len(set_after - set_before)
colors = ["#953331", "#316395", "#319545"]
diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
index 090db873..0ac092af 100644
--- a/ms2rescore/report/generate.py
+++ b/ms2rescore/report/generate.py
@@ -82,7 +82,7 @@ def generate_report(
# Read config
config = json.loads(files["configuration"].read_text())
- logger.info("Recalculating confidence estimates...")
+ logger.debug("Recalculating confidence estimates...")
fasta_file = config["ms2rescore"]["fasta_file"]
confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file)
@@ -139,7 +139,7 @@ def generate_report(
def _collect_files(output_path_prefix, use_txt_log=False):
"""Collect all files generated by MS²Rescore."""
- logger.info("Collecting files...")
+ logger.debug("Collecting files...")
files = {
"PSMs": Path(output_path_prefix + ".psms.tsv").resolve(),
"configuration": Path(output_path_prefix + ".full-config.json").resolve(),
@@ -151,7 +151,7 @@ def _collect_files(output_path_prefix, use_txt_log=False):
}
for file, path in files.items():
if Path(path).is_file():
- logger.info("✅ Found %s: '%s'", file, path.as_posix())
+ logger.debug("✅ Found %s: '%s'", file, path.as_posix())
else:
logger.warning("❌ %s: '%s'", file, path.as_posix())
files[file] = None
@@ -164,6 +164,11 @@ def _get_stats_context(confidence_before, confidence_after):
levels = ["psms", "peptides", "proteins"]
level_names = ["PSMs", "Peptides", "Protein groups"]
card_colors = ["card-bg-blue", "card-bg-green", "card-bg-red"]
+
+ # Cannot report stats if confidence estimates are not present
+ if not confidence_before or not confidence_after:
+ return stats
+
for level, level_name, card_color in zip(levels, level_names, card_colors):
try:
before = confidence_before.accepted[level.lower()]
@@ -178,7 +183,7 @@ def _get_stats_context(confidence_before, confidence_after):
"item": level_name,
"card_color": card_color,
"number": after,
- "diff": f"{after - before:+}",
+ "diff": f"({after - before:+})",
"percentage": f"{increase:.1f}%",
"is_increase": increase > 0,
"bar_percentage": before / after * 100 if increase > 0 else after / before * 100,
diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py
index e8568e59..069a641f 100644
--- a/ms2rescore/report/utils.py
+++ b/ms2rescore/report/utils.py
@@ -51,39 +51,26 @@ def get_confidence_estimates(
"was generated by MS²Rescore. Could not generate report."
) from e
+ score_after = psm_list["score"]
peptide = (
pd.Series(psm_list["peptidoform"]).astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
)
- psms = pd.DataFrame(
- {
- "peptide": peptide,
- "is_target": ~psm_list["is_decoy"],
- "before": score_before,
- "after": psm_list["score"],
- }
- ).reset_index()
-
+ psms = pd.DataFrame({"peptide": peptide, "is_target": ~psm_list["is_decoy"]}).reset_index()
+ lin_psm_dataset = LinearPsmDataset(
+ psms=psms,
+ target_column="is_target",
+ spectrum_columns="index",
+ peptide_column="peptide",
+ )
if fasta_file:
fasta = read_fasta(fasta_file)
+ lin_psm_dataset.add_proteins(fasta)
confidence = dict()
- for when in ["before", "after"]:
- lin_psm_dataset = LinearPsmDataset(
- psms=psms,
- target_column="is_target",
- spectrum_columns="index",
- feature_columns=[when],
- peptide_column="peptide",
- )
- if fasta_file:
- lin_psm_dataset.add_proteins(fasta)
-
+ for when, scores in [("before", score_before), ("after", score_after)]:
try:
- confidence[when] = lin_psm_dataset.assign_confidence()
- except RuntimeError as e:
- raise ReportGenerationError(
- f"Error while assigning confidence estimates to PSMs ({when} rescoring). "
- "Could not generate report."
- ) from e
+ confidence[when] = lin_psm_dataset.assign_confidence(scores=scores)
+ except RuntimeError:
+ confidence[when] = None
return confidence["before"], confidence["after"]
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index f3927f47..f02d877f 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -20,7 +20,8 @@
"""
import logging
-from typing import Any, List, Optional, Tuple, Dict
+import re
+from typing import Any, Dict, List, Optional, Tuple
import mokapot
import numpy as np
@@ -28,15 +29,20 @@
import psm_utils
from mokapot.brew import brew
from mokapot.dataset import LinearPsmDataset
+from mokapot.model import PercolatorModel
from pyteomics.mass import nist_mass
+from ms2rescore.exceptions import RescoringError
+
logger = logging.getLogger(__name__)
+logging.getLogger("numba").setLevel(logging.WARNING)
def rescore(
psm_list: psm_utils.PSMList,
output_file_root: str = "ms2rescore",
fasta_file: Optional[str] = None,
+ train_fdr: float = 0.01,
write_weights: bool = False,
write_txt: bool = False,
write_flashlfq: bool = False,
@@ -63,6 +69,8 @@ def rescore(
fasta_file
Path to FASTA file with protein sequences to use for protein inference. Defaults to
``None``.
+ train_fdr
+ FDR to use for training the Mokapot model. Defaults to ``0.01``.
write_weights
Write model weights to a text file. Defaults to ``False``.
write_txt
@@ -89,27 +97,15 @@ def rescore(
# Rescore
logger.debug(f"Mokapot brew options: `{kwargs}`")
- confidence_results, models = brew(lin_psm_data, **kwargs)
-
- # Reshape confidence estimates to match PSMList
- mokapot_values_targets = (
- confidence_results.confidence_estimates["psms"]
- .set_index("index")
- .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
- )
- mokapot_values_decoys = (
- confidence_results.decoy_confidence_estimates["psms"]
- .set_index("index")
- .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
- )
- q = np.full((len(psm_list), 3), np.nan)
- q[mokapot_values_targets.index] = mokapot_values_targets.values
- q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+ try:
+ confidence_results, models = brew(
+ lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
+ )
+ except RuntimeError as e:
+ raise RescoringError("Mokapot could not be run. Please check the input data.") from e
- # Add Mokapot results to PSMList
- psm_list["score"] = q[:, 0]
- psm_list["qvalue"] = q[:, 1]
- psm_list["pep"] = q[:, 2]
+ add_psm_confidence(psm_list, confidence_results)
+ add_peptide_confidence(psm_list, confidence_results)
# Write results
if write_weights:
@@ -171,6 +167,10 @@ def convert_psm_list(
feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)
+ # Ensure filename for FlashLFQ txt output
+ if not combined_df["run"].notnull().all():
+ combined_df["run"] = "na"
+
feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None
lin_psm_data = LinearPsmDataset(
@@ -220,6 +220,58 @@ def save_model_weights(
)
+def add_psm_confidence(
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+ """Add PSM-level confidence estimates to PSM list, updating score, qvalue, pep, and rank."""
+ # Reshape confidence estimates to match PSMList
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+ mokapot_values_targets = (
+ confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
+ )
+ mokapot_values_decoys = (
+ confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
+ )
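+ # Align estimates with the original PSMList order via the "index" column; PSMs missing from both tables stay NaN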
+ q = np.full((len(psm_list), 3), np.nan)
+ q[mokapot_values_targets.index] = mokapot_values_targets.values
+ q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+
+ # Add Mokapot results to PSMList
+ psm_list["score"] = q[:, 0]
+ psm_list["qvalue"] = q[:, 1]
+ psm_list["pep"] = q[:, 2]
+
+ # Reset ranks to match new scores
+ psm_list.set_ranks(lower_score_better=False)
+
+
+def add_peptide_confidence(
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+ """Add Mokapot peptide-level confidence estimates to PSM list."""
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+ peptide_info = pd.concat(
+ [
+ confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys],
+ confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys],
+ ],
+ axis=0,
+ ).to_dict(orient="index")
+
+ # Add peptide-level scores to PSM metadata
+ # run_key = "na" if not all(psm.run for psm in psm_list) else None
+ no_charge_pattern = re.compile(r"(/\d+$)")
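+ # Peptide keys in the Mokapot results lack the peptidoform's "/charge" suffix, so strip it before lookup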
+ for psm in psm_list:
+ peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
+ psm.metadata.update(
+ {
+ "peptide_score": peptide_scores["mokapot score"],
+ "peptide_qvalue": peptide_scores["mokapot q-value"],
+ "peptide_pep": peptide_scores["mokapot PEP"],
+ }
+ )
+
+
def _mz_to_mass(mz: float, charge: int) -> float:
"""Convert m/z to mass."""
return mz * charge - charge * nist_mass["H"][1][0]
diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py
index 5f7d4e5d..9952aed5 100644
--- a/ms2rescore/rescoring_engines/percolator.py
+++ b/ms2rescore/rescoring_engines/percolator.py
@@ -20,8 +20,8 @@
import logging
import subprocess
from typing import Any, Dict, Optional
+from copy import deepcopy
-import numpy as np
import psm_utils
from ms2rescore.exceptions import MS2RescoreError
@@ -103,8 +103,15 @@ def rescore(
# Need to be able to link back to original PSMs, so reindex spectrum IDs, but copy PSM list
# to avoid modifying original...
# TODO: Better approach for this?
- psm_list_reindexed = psm_list.copy()
- psm_list_reindexed["spectrum_id"] = np.arange(len(psm_list_reindexed))
+
+ psm_list_reindexed = deepcopy(psm_list)
+ psm_list_reindexed.set_ranks()
+ psm_list_reindexed["spectrum_id"] = [
+ f"{psm.get_usi(as_url=False)}_{psm.rank}" for psm in psm_list_reindexed
+ ]
+ spectrum_id_index = {
+ spectrum_id: index for index, spectrum_id in enumerate(psm_list_reindexed["spectrum_id"])
+ }
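+ # Map each reindexed spectrum ID to its position in the original PSM list so Percolator results can be re-ordered later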
_write_pin_file(psm_list_reindexed, pin_filepath)
@@ -134,10 +141,13 @@ def rescore(
psm_list,
percolator_kwargs["results-psms"],
percolator_kwargs["decoy-results-psms"],
+ spectrum_id_index,
)
-def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str):
+def _update_psm_scores(
+ psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str, spectrum_id_index: Dict[str, int]
+):
"""
Update PSM scores with Percolator results.
@@ -150,7 +160,9 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
psm_list_percolator = psm_utils.PSMList(psm_list=target_psms.psm_list + decoy_psms.psm_list)
# Sort by reindexed spectrum_id so order matches original PSM list
- psm_list_percolator[np.argsort(psm_list_percolator["spectrum_id"])]
+ psm_list_percolator = sorted(
+ psm_list_percolator, key=lambda psm: spectrum_id_index[psm["spectrum_id"]]
+ )
if not len(psm_list) == len(psm_list_percolator):
raise MS2RescoreError(
@@ -163,6 +175,8 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
original_psm["qvalue"] = new_psm["qvalue"]
original_psm["pep"] = new_psm["pep"]
+ psm_list.set_ranks(lower_score_better=False)
+
def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str):
"""Write PIN file for rescoring."""
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 70417b56..3515893a 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -35,44 +35,61 @@ def infer_spectrum_path(
"and no run name in PSM file found."
)
- # If passed path is directory, join with run name
- elif os.path.isdir(configured_path):
- if run_name:
- resolved_path = os.path.join(configured_path, run_name)
+ else:
+ is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path)
+
+ # If passed path is directory (that is not Bruker raw), join with run name
+ if os.path.isdir(configured_path) and not is_bruker_dir:
+ if run_name:
+ resolved_path = os.path.join(configured_path, run_name)
+ else:
+ raise MS2RescoreConfigurationError(
+ "Could not resolve spectrum file name: Spectrum path is directory "
+ "but no run name in PSM file found."
+ )
+
+ # If passed path is file, use that, but warn if basename doesn't match expected
+ elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir):
+ if run_name and Path(configured_path).stem != Path(run_name).stem:
+ logger.warning(
+ "Passed spectrum path (`%s`) does not match run name found in PSM "
+ "file (`%s`). Continuing with passed spectrum path.",
+ configured_path,
+ run_name,
+ )
+ resolved_path = configured_path
else:
raise MS2RescoreConfigurationError(
- "Could not resolve spectrum file name: Spectrum path is directory "
- "but no run name in PSM file found."
- )
-
- # If passed path is file, use that, but warn if basename doesn't match expected
- elif os.path.isfile(configured_path):
- if run_name and Path(configured_path).stem != Path(run_name).stem:
- logger.warning(
- "Passed spectrum path (`%s`) does not match run name found in PSM "
- "file (`%s`). Continuing with passed spectrum path.",
- configured_path,
- run_name,
+ "Configured `spectrum_path` must be `None` or a path to an existing file "
+ "or directory. If `None` or path to directory, spectrum run information "
+ "should be present in the PSM file."
)
- resolved_path = configured_path
- else:
- raise MS2RescoreConfigurationError(
- "Configured `spectrum_path` must be `None` or a path to an existing file "
- "or directory. If `None` or path to directory, spectrum run information "
- "should be present in the PSM file."
- )
# Match with file extension if not in resolved_path yet
- if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE):
+ if not _is_minitdf(resolved_path) and not re.match(
+ r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE
+ ):
for filename in glob(resolved_path + "*"):
- if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE):
+ if re.match(r".*(\.mgf$|\.mzml$|\.d)", filename, flags=re.IGNORECASE):
resolved_path = filename
break
else:
raise MS2RescoreConfigurationError(
- "Resolved spectrum filename does not contain a supported file "
- "extension (mgf or mzml) and could not find any matching existing "
+ f"Resolved spectrum filename ('{resolved_path}') does not contain a supported "
+ "file extension (mzML, MGF, or .d) and could not find any matching existing "
"files."
)
return Path(resolved_path)
+
+
+def _is_minitdf(spectrum_file: str) -> bool:
+ """
+ Check if the spectrum file is a Bruker miniTDF folder.
+
+ A Bruker miniTDF folder has no fixed name, but contains files matching the patterns
+ ``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``.
+ """
+ files = set(Path(spectrum_file).glob("*ms2spectrum.bin"))
+ files.update(Path(spectrum_file).glob("*ms2spectrum.parquet"))
+ return len(files) >= 2
diff --git a/pyproject.toml b/pyproject.toml
index 8f135149..694fa068 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,24 +32,24 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.8"
dependencies = [
- "numpy>=1.16.0; python_version != '3.11'",
- "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
- "pandas>=1.0",
- "rich>=12",
- "pyteomics>=4.1.0",
- "lxml>=4.5",
- "ms2pip>=4.0.0-dev4",
- "click>=7",
"cascade-config>=0.4.0",
+ "click>=7",
+ "customtkinter>=5,<6",
"deeplc>=2.2",
"deeplcretrainer>=0.2",
- "tomli>=2; python_version < '3.11'",
- "psm_utils>=0.4",
- "customtkinter>=5,<6",
- "mokapot>=0.9",
- "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
+ "im2deep>=0.1.3",
"jinja2>=3",
+ "lxml>=4.5",
+ "mokapot>=0.9",
+ "ms2pip>=4.0.0-dev10",
+ "ms2rescore_rs",
+ "numpy>=1.16.0",
+ "pandas>=1.0",
"plotly>=5",
+ "psm_utils>=0.9",
+ "pyteomics>=4.7.2",
+ "rich>=12",
+ "tomli>=2; python_version < '3.11'",
]
[project.optional-dependencies]
@@ -78,6 +78,7 @@ CompOmics = "https://www.compomics.com"
ms2rescore = "ms2rescore.__main__:main"
ms2rescore-gui = "ms2rescore.gui.__main__:main"
ms2rescore-report = "ms2rescore.report.__main__:main"
+tims2rescore = "ms2rescore.__main__:main_tims"
[build-system]
requires = ["flit_core >=3.2,<4"]
diff --git a/tests/test_data/test.mgf b/tests/test_data/test.mgf
new file mode 100644
index 00000000..e4899c08
--- /dev/null
+++ b/tests/test_data/test.mgf
@@ -0,0 +1,13 @@
+BEGIN IONS
+TITLE=peptide: peptide1
+CHARGE=2+
+PEPMASS=475.137295
+ION_MOBILITY=42.42
+RTINSECONDS=51.2
+72.04439 100
+148.06043 600
+232.07504 300
+263.08737 400
+347.10198 500
+423.11802 200
+END IONS
diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py
new file mode 100644
index 00000000..4dffc9dc
--- /dev/null
+++ b/tests/test_parse_spectra.py
@@ -0,0 +1,23 @@
+import pytest
+from psm_utils import PSM, PSMList
+
+from ms2rescore.parse_spectra import get_missing_values
+
+
+def test_get_missing_values():
+ psm_list = PSMList(
+ psm_list=[
+ PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"),
+ ]
+ )
+ get_missing_values(
+ psm_list,
+ config={
+ "spectrum_path": "tests/test_data/test.mgf",
+ "spectrum_id_pattern": "peptide: (.*)",
+ },
+ rt_required=True,
+ im_required=True,
+ )
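+ # RTINSECONDS=51.2 in the test MGF is expected to be returned in minutes (51.2 / 60 ≈ 0.853)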
+ assert psm_list[0].retention_time == pytest.approx(0.853, 0.001)
+ assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01)