Skip to content

Commit

Permalink
Merge pull request #162 from compomics/refactor-parse-spectra
Browse files Browse the repository at this point in the history
Refactor parsing of precursor data from spectrum files
  • Loading branch information
RalfG authored Dec 5, 2024
2 parents f8cab91 + b417a6c commit 581da04
Show file tree
Hide file tree
Showing 9 changed files with 318 additions and 101 deletions.
73 changes: 18 additions & 55 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
from multiprocessing import cpu_count
from typing import Dict, Optional

import numpy as np
import psm_utils.io
from mokapot.dataset import LinearPsmDataset
from psm_utils import PSMList

from ms2rescore import exceptions
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.parse_spectra import add_precursor_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
Expand Down Expand Up @@ -62,20 +61,28 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)

# Add missing precursor info from spectrum file if needed
psm_list = _fill_missing_precursor_info(psm_list, config)
available_ms_data = add_precursor_values(
psm_list, config["spectrum_path"], config["spectrum_id_pattern"]
)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all():
logger.warning(
"MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping "
"this feature generator."
)
continue
# Compile configuration
conf = config.copy()
conf.update(fgen_config)
fgen = FEATURE_GENERATORS[fgen_name](**conf)

# Check if required MS data is available
missing_ms_data = fgen.required_ms_data - available_ms_data
if missing_ms_data:
logger.warning(
f"Skipping feature generator {fgen_name} because required MS data is missing: "
f"{missing_ms_data}. Ensure that the required MS data is present in the input "
"files or disable the feature generator."
)
continue

# Add features
fgen.add_features(psm_list)
logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}")
feature_names[fgen_name] = set(fgen.feature_names)
Expand All @@ -102,6 +109,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
# Write feature names to file
_write_feature_names(feature_names, output_file_root)

# Rename PSMs to USIs if requested
if config["rename_to_usi"]:
logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
Expand Down Expand Up @@ -183,51 +191,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.exception(e)


def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
"""Fill missing precursor info from spectrum file if needed."""
# Check if required
# TODO: avoid hard coding feature generators in some way
rt_required = ("deeplc" in config["feature_generators"]) and any(
v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
)
im_required = (
"ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
logger.debug(f"RT required: {rt_required}, IM required: {im_required}")

# Add missing values
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)

# Check if values are now present
for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]:
if required and (
0.0 in psm_list[value_name]
or None in psm_list[value_name]
or np.isnan(psm_list[value_name]).any()
):
if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
raise exceptions.MissingValuesError(
f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
f"feature generators that require '{value_name}' or ensure that the values are "
"present in the input files."
)
else:
missing_value_psms = psm_list[
[v is None or np.isnan(v) for v in psm_list[value_name]]
]
logger.warning(
f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
"These PSMs will be removed."
)
psm_list = psm_list[
[v is not None and not np.isnan(v) for v in psm_list[value_name]]
]

return psm_list


def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
"""Filter PSMs by rank."""
psm_list.set_ranks(lower_score_better=lower_score_better)
Expand Down
6 changes: 6 additions & 0 deletions ms2rescore/feature_generators/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from abc import ABC, abstractmethod
from typing import Set

from psm_utils import PSMList

from ms2rescore.parse_spectra import MSDataType


class FeatureGeneratorBase(ABC):
"""Base class from which all feature generators must inherit."""

# List of required MS data types for feature generation
required_ms_data: Set[MSDataType] = set()

def __init__(self, *args, **kwargs) -> None:
super().__init__()

Expand Down
4 changes: 4 additions & 0 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from psm_utils import PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_spectra import MSDataType

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
Expand All @@ -35,6 +36,8 @@
class DeepLCFeatureGenerator(FeatureGeneratorBase):
"""DeepLC retention time-based feature generator."""

required_ms_data = {MSDataType.retention_time}

def __init__(
self,
*args,
Expand Down Expand Up @@ -138,6 +141,7 @@ def add_features(self, psm_list: PSMList) -> None:
)

# Disable wild logging to stdout by Tensorflow, unless in debug mode

with contextlib.redirect_stdout(
open(os.devnull, "w", encoding="utf-8")
) if not self._verbose else contextlib.nullcontext():
Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/im2deep.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from psm_utils import PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_spectra import MSDataType

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
Expand All @@ -30,6 +31,8 @@
class IM2DeepFeatureGenerator(FeatureGeneratorBase):
"""IM2Deep collision cross section feature generator."""

required_ms_data = {MSDataType.ion_mobility}

def __init__(
self,
*args,
Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from psm_utils import Peptidoform, PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.parse_spectra import MSDataType

try:
from ionmob import __file__ as ionmob_file
Expand Down Expand Up @@ -55,6 +56,8 @@
class IonMobFeatureGenerator(FeatureGeneratorBase):
"""Ionmob collisional cross section (CCS)-based feature generator."""

required_ms_data = {MSDataType.ion_mobility}

def __init__(
self,
*args,
Expand Down
37 changes: 26 additions & 11 deletions ms2rescore/feature_generators/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@
class MaxQuantFeatureGenerator(FeatureGeneratorBase):
"""Generate MaxQuant-derived features."""

available_features = [
"mean_error_top7",
"sq_mean_error_top7",
"stdev_error_top7",
"ln_explained_ion_current",
"ln_nterm_ion_current_ratio",
"ln_cterm_ion_current_ratio",
"ln_ms2_ion_current",
]

def __init__(self, *args, **kwargs) -> None:
"""
Generate MaxQuant-derived features.
Expand All @@ -39,30 +49,30 @@ def __init__(self, *args, **kwargs) -> None:
"""
super().__init__(*args, **kwargs)
self._feature_names = self.available_features.copy()

@property
def feature_names(self) -> List[str]:
return [
"mean_error_top7",
"sq_mean_error_top7",
"stdev_error_top7",
"ln_explained_ion_current",
"ln_nterm_ion_current_ratio",
"ln_cterm_ion_current_ratio",
"ln_ms2_ion_current",
]
return self._feature_names

def add_features(self, psm_list: PSMList):
"""
Add MS²PIP-derived features to PSMs.
Add MaxQuant-derived features to PSMs.
Parameters
----------
psm_list
PSMs to add features to.
"""
logger.info("Adding MaxQuant-derived features to PSMs.")
# Check if all PSMs are from MaxQuant
if not self._all_psms_from_maxquant(psm_list):
self._feature_names = [] # Set feature names to empty list to indicate none added
logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.")
return
else:
self._feature_names = self.available_features # Reset feature names
logger.info("Adding MaxQuant-derived features to PSMs.")

# Infer mass deviations column name
for column_name in [
Expand Down Expand Up @@ -90,6 +100,11 @@ def add_features(self, psm_list: PSMList):
for psm in psm_list:
psm["rescoring_features"].update(self._compute_features(psm["metadata"]))

@staticmethod
def _all_psms_from_maxquant(psm_list):
"""Check if the PSMs are from MaxQuant."""
return (psm_list["source"] == "msms").all()

def _compute_features(self, psm_metadata):
"""Compute features from derived from intensities and mass errors."""
features = {}
Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/ms2pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from rich.progress import track

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.parse_spectra import MSDataType
from ms2rescore.utils import infer_spectrum_path

logger = logging.getLogger(__name__)
Expand All @@ -46,6 +47,8 @@
class MS2PIPFeatureGenerator(FeatureGeneratorBase):
"""Generate MS²PIP-based features."""

required_ms_data = {MSDataType.ms2_spectra}

def __init__(
self,
*args,
Expand Down
Loading

0 comments on commit 581da04

Please sign in to comment.