Skip to content

Commit

Permalink
Merge branch 'timsRescore' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurDeclercq authored Mar 22, 2024
2 parents 23dfa95 + 86363ee commit ff81bd9
Show file tree
Hide file tree
Showing 17 changed files with 456 additions and 149 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.11"

Expand Down Expand Up @@ -47,7 +47,7 @@ jobs:
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"

Expand Down
10 changes: 3 additions & 7 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8
pip install ruff
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run Ruff
run: ruff check --output-format=github .

- name: Build and install ms2rescore package
run: |
Expand Down
11 changes: 6 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
FROM ubuntu:focal
FROM python:3.10

# ARG DEBIAN_FRONTEND=noninteractive

LABEL name="ms2rescore"

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore
# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore

ADD pyproject.toml /ms2rescore/pyproject.toml
ADD LICENSE /ms2rescore/LICENSE
Expand All @@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in
ADD ms2rescore /ms2rescore/ms2rescore

RUN apt-get update \
&& apt-get install --no-install-recommends -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \
&& rm -rf /var/lib/apt/lists/* \
&& pip3 install ms2rescore/
&& apt install -y procps git-lfs \
&& pip install /ms2rescore

ENTRYPOINT [""]
11 changes: 11 additions & 0 deletions docs/source/config_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*.
- **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*.
- **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*.
- **`im2deep`**: Refer to *[#/definitions/im2deep](#definitions/im2deep)*.
- **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`.
- **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*.
Expand Down Expand Up @@ -47,6 +48,14 @@
- **One of**
- *string*
- *null*
- **`psm_id_rt_pattern`**: Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group. Default: `null`.
- **One of**
- *string*
- *null*
- **`psm_id_im_pattern`**: Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group. Default: `null`.
- **One of**
- *string*
- *null*
- **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`.
- **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`.
- **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`.
Expand Down Expand Up @@ -75,6 +84,8 @@
- **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`.
- **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`.
- <a id="definitions/im2deep"></a>**`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
- **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`.
- <a id="definitions/mokapot"></a>**`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
Expand Down
14 changes: 10 additions & 4 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.parse_spectra import fill_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore import exceptions
Expand Down Expand Up @@ -59,11 +59,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)

# TODO: avoid hard coding feature generators in some way
rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
rt_required = ("deeplc" in config["feature_generators"]) and (
None in psm_list["retention_time"]
)
# Ion mobility values are only needed when an IM-based feature generator is enabled.
# Each name must be tested for membership separately: `"ionmob" or "im2deep" in ...`
# parses as `"ionmob" or (...)`, which is always truthy.
im_required = any(
    fgen in config["feature_generators"] for fgen in ("ionmob", "im2deep")
) and (None in psm_list["ion_mobility"])
logger.debug(f"RT required: {rt_required}, IM required: {im_required}")

if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
Expand Down
2 changes: 2 additions & 0 deletions ms2rescore/feature_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator

# Registry mapping a short feature generator name to the class implementing it.
# NOTE(review): the names presumably match the keys under `feature_generators`
# in the configuration -- confirm against the lookup in `core.rescore`.
FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
"ms2pip": MS2PIPFeatureGenerator,
"deeplc": DeepLCFeatureGenerator,
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
"im2deep": IM2DeepFeatureGenerator,
}
55 changes: 21 additions & 34 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@
from collections import defaultdict
from inspect import getfullargspec
from itertools import chain
from typing import List, Optional, Union
from typing import List, Union

import numpy as np
import pandas as pd
from psm_utils import PSMList
from psm_utils.io import peptide_record

from ms2rescore.feature_generators.base import FeatureGeneratorBase

Expand All @@ -41,8 +39,7 @@ def __init__(
self,
*args,
lower_score_is_better: bool = False,
calibration_set_size: Union[int, float] = 0.15,
spectrum_path: Optional[str] = None,
calibration_set_size: Union[int, float, None] = None,
processes: int = 1,
**kwargs,
) -> None:
Expand All @@ -59,9 +56,6 @@ def __init__(
calibration_set_size: int, float or None
Amount of best PSMs to use for DeepLC calibration. If this value is lower
than the number of available PSMs, all PSMs will be used. If None, all target PSMs
with a q-value <= 0.01 are used for calibration instead. (default: None)
spectrum_path
Path to spectrum file or directory with spectrum files. If None, inferred from `run`
field in PSMs. Defaults to None.
processes: {int, None}
Number of processes to use in DeepLC. Defaults to 1.
kwargs: dict
Expand All @@ -77,7 +71,6 @@ def __init__(

self.lower_psm_score_better = lower_score_is_better
self.calibration_set_size = calibration_set_size
self.spectrum_path = spectrum_path
self.processes = processes
self.deeplc_kwargs = kwargs or {}

Expand Down Expand Up @@ -151,17 +144,15 @@ def add_features(self, psm_list: PSMList) -> None:
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

logger.debug("Calibrating DeepLC...")
psm_list_calibration = self._get_calibration_psms(psm_list_run)
logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
self.deeplc_predictor.calibrate_preds(psm_list_calibration)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.selected_model:
Expand All @@ -174,11 +165,7 @@ def add_features(self, psm_list: PSMList) -> None:
)

logger.debug("Predicting retention times...")
predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
)
)
predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run))
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

Expand All @@ -204,25 +191,25 @@ def add_features(self, psm_list: PSMList) -> None:
)
current_run += 1

# TODO: Remove when DeepLC supports PSMList directly
@staticmethod
def _psm_list_to_deeplc_peprec(psm_list: PSMList) -> pd.DataFrame:
peprec = peptide_record.to_dataframe(psm_list)
peprec = peprec.rename(
columns={
"observed_retention_time": "tr",
"peptide": "seq",
}
)[["tr", "seq", "modifications"]]
return peprec

def _get_calibration_psms(self, psm_list: PSMList):
"""Get N best scoring target PSMs for calibration."""
psm_list_targets = psm_list[~psm_list["is_decoy"]]
n_psms = self._get_number_of_calibration_psms(psm_list_targets)
indices = np.argsort(psm_list_targets["score"])
indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
return psm_list_targets[indices]
if self.calibration_set_size:
n_psms = self._get_number_of_calibration_psms(psm_list_targets)
indices = np.argsort(psm_list_targets["score"])
indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
return psm_list_targets[indices]
else:
identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01]
if len(identified_psms) == 0:
raise ValueError(
"No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc."
)
elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]):
logger.warning(
" Less than 500 target PSMs with q-value <= 0.01 found for retraining. Consider turning of deeplc_retrain, as this is likely not enough data for retraining."
)
return identified_psms

def _get_number_of_calibration_psms(self, psm_list):
"""Get number of calibration PSMs given `calibration_set_size` and total number of PSMs."""
Expand Down
Loading

0 comments on commit ff81bd9

Please sign in to comment.