Skip to content

Commit

Permalink
Completely refactor fasta2speclib; implement new features:
Browse files Browse the repository at this point in the history
- Support for C-terminal modifications
- Differentiate between peptide and protein termini for variable modifications
- Allow filtering of peptides based on precursor m/z
- Allow semi-specific cleavage
- Allow setting of a maximum of variable modifications per peptide
- Disable PyGAM for DeepLC calibration on iRT peptides (leads to strange predictions)
- Add tests for modification assignment
- Removed support for: Elude-based RT predictions, RT predictions file, PEPREC filter, saving temporary PEPREC files
  • Loading branch information
RalfG committed Feb 21, 2023
1 parent 3e4935d commit 7630502
Show file tree
Hide file tree
Showing 9 changed files with 874 additions and 532 deletions.
1,062 changes: 591 additions & 471 deletions fasta2speclib/fasta2speclib.py

Large diffs are not rendered by default.

18 changes: 6 additions & 12 deletions fasta2speclib/fasta2speclib_config.json
Original file line number Diff line number Diff line change
@@ -1,25 +1,19 @@
{
"output_filetype":["msp", "mgf", "bibliospec", "spectronaut", "hdf", "dlib"],
"output_filetype":["msp", "mgf", "bibliospec", "dlib"],
"charges":[2, 3],
"min_peplen":8,
"max_peplen":30,
"cleavage_rule":"trypsin",
"missed_cleavages":2,
"modifications":[
{"name":"Glu->pyro-Glu", "unimod_accession":27, "mass_shift":-18.010565, "amino_acid":"E", "n_term":true, "fixed":false},
{"name":"Gln->pyro-Glu", "unimod_accession":28, "mass_shift":-17.026549, "amino_acid":"Q", "n_term":true, "fixed":false},
{"name":"Acetyl", "unimod_accession":1, "mass_shift":42.010565, "amino_acid":null, "n_term":true, "fixed":false},
{"name":"Oxidation", "unimod_accession":35, "mass_shift":15.994915, "amino_acid":"M", "n_term":false, "fixed":false},
{"name":"Carbamidomethyl", "unimod_accession":4, "mass_shift":57.021464, "amino_acid":"C", "n_term":false, "fixed":true}
{"name":"Acetyl", "unimod_accession":1, "mass_shift":42.010565, "amino_acid":null, "protein_n_term":true},
{"name":"Oxidation", "unimod_accession":35, "mass_shift":15.994915, "amino_acid":"M"},
{"name":"Carbamidomethyl", "unimod_accession":4, "mass_shift":57.021464, "amino_acid":"C", "fixed":true}
],
"ms2pip_model":"HCD",
"decoy":true,
"decoy":false,
"add_retention_time":true,
"deeplc": {},
"elude_model_file":null,
"rt_predictions_file":null,
"peprec_filter":null,
"save_peprec":false,
"batch_size":10000,
"num_cpu":24
"num_cpu":null
}
1 change: 1 addition & 0 deletions ms2pip/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ms2pip.ms2pipC import MS2PIP
61 changes: 39 additions & 22 deletions ms2pip/ms2pipC.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,23 @@
from rich.progress import track

from ms2pip.cython_modules import ms2pip_pyx
from ms2pip.exceptions import (FragmentationModelRequiredError,
InvalidModificationFormattingError,
InvalidPEPRECError, MissingConfigurationError,
NoMatchingSpectraFound,
NoValidPeptideSequencesError, TitlePatternError,
UnknownFragmentationMethodError,
UnknownModificationError,
UnknownOutputFormatError)
from ms2pip.exceptions import (
FragmentationModelRequiredError,
InvalidModificationFormattingError,
InvalidPEPRECError,
MissingConfigurationError,
NoMatchingSpectraFound,
NoValidPeptideSequencesError,
TitlePatternError,
UnknownFragmentationMethodError,
UnknownModificationError,
UnknownOutputFormatError,
)
from ms2pip.feature_names import get_feature_names_new
from ms2pip.match_spectra import MatchSpectra
from ms2pip.ms2pip_tools import calc_correlations, spectrum_output
from ms2pip.peptides import (AMINO_ACID_IDS, Modifications,
write_amino_acid_masses)
from ms2pip.predict_xgboost import (get_predictions_xgb,
validate_requested_xgb_model)
from ms2pip.peptides import AMINO_ACID_IDS, Modifications, write_amino_acid_masses
from ms2pip.predict_xgboost import get_predictions_xgb, validate_requested_xgb_model
from ms2pip.retention_time import RetentionTime
from ms2pip.spectrum import read_spectrum_file

Expand Down Expand Up @@ -545,18 +547,23 @@ def apply_mods(peptide, mods, PTMmap):
version of the peptide sequence, c- and n-term modifications. This modified
version are hard coded in ms2pipfeatures_c.c for now.
"""
modpeptide = np.array(peptide[:], dtype=np.uint16)

modpeptide = np.array(peptide[:], dtype=np.uint16) # Copy to avoid inplace changes
if mods != "-":
l = mods.split("|")
if len(l) % 2 != 0:
raise InvalidModificationFormattingError(mods)
for i in range(0, len(l), 2):
tl = l[i + 1]
if tl in PTMmap:
modpeptide[int(l[i])] = PTMmap[tl]
else:
try:
mod = PTMmap[tl]
except:
raise UnknownModificationError(tl)
try:
modpeptide[int(l[i])] = mod
except IndexError:
raise InvalidModificationFormattingError(
f"Amino acid position not in peptide for modifications: `{mods}`"
)

return modpeptide

Expand All @@ -574,7 +581,7 @@ def __init__(
self,
pep_file,
spec_file=None,
spectrum_id_pattern="(.*)",
spectrum_id_pattern=None,
vector_file=None,
num_cpu=1,
params=None,
Expand Down Expand Up @@ -652,7 +659,9 @@ def __init__(
"""
self.pep_file = pep_file
self.vector_file = vector_file
self.spectrum_id_pattern = spectrum_id_pattern
self.spectrum_id_pattern = (
spectrum_id_pattern if spectrum_id_pattern else "(.*)"
)
self.num_cpu = num_cpu
self.params = params
self.return_results = return_results
Expand Down Expand Up @@ -709,12 +718,14 @@ def __init__(
else:
raise UnknownFragmentationMethodError(self.model)

# TODO: Move to run?
if output_filename is None and not return_results:
self.output_filename = "{}_{}".format(
".".join(pep_file.split(".")[:-1]), self.model
)
else:
self.output_filename = output_filename
# TODO

logger.debug(f"Starting workers (num_cpu={self.num_cpu})...")
if multiprocessing.current_process().daemon:
Expand All @@ -728,6 +739,7 @@ def __init__(
else:
self.myPool = multiprocessing.Pool(self.num_cpu)

# TODO: Move to run?
if self.match_spectra:
self.spec_file = None
if self.sqldb_uri:
Expand All @@ -740,24 +752,29 @@ def __init__(
else:
self.spec_file = spec_file
self.spec_files = None
# TODO

self.mods = Modifications()
for mod_type in ("sptm", "ptm"):
self.mods.add_from_ms2pip_modstrings(
self.params["ms2pip"][mod_type], mod_type=mod_type
)

# TODO: Pass PEPREC and SPECFILE args here?
def run(self):
"""Run initiated MS2PIP based on class configuration."""

# TODO: MOVE TO INIT?
self.afile = write_amino_acid_masses()
self.modfile = self.mods.write_modifications_file(mod_type="ptm")
self.modfile2 = self.mods.write_modifications_file(mod_type="sptm")
#

self._read_peptide_information()

if self.add_retention_time:
logger.info("Adding retention time predictions")
rt_predictor = RetentionTime(config=self.params)
rt_predictor = RetentionTime(config=self.params, num_cpu=self.num_cpu)
rt_predictor.add_rt_predictions(self.data)

# Spectrum file mode
Expand Down Expand Up @@ -846,8 +863,8 @@ def _read_peptide_information(self):
)
else:
data = self.pep_file
# for some reason the missing values are converted to float otherwise
data = data.fillna("-")
# Unmodified peptides should have a '-', not NaN or an empty string
data["modifications"] = data["modifications"].replace("", "-").fillna("-")

# Filter PEPREC for unsupported peptides
num_pep = len(data)
Expand Down
7 changes: 2 additions & 5 deletions ms2pip/ms2pip_tools/spectrum_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def _get_msp_modifications(self, sequence, modifications):
"""

if isinstance(modifications, str):
if modifications == "-":
if not modifications or modifications == "-":
msp_modifications = "0"
else:
mods = modifications.split("|")
Expand Down Expand Up @@ -835,10 +835,7 @@ def write_csv(self):
logger.info("Writing results to %s", f_name)

self.all_preds.to_csv(
file_object,
float_format="%.6g",
index=False,
lineterminator="\n"
file_object, float_format="%.6g", index=False, lineterminator="\n"
)
return file_object

Expand Down
54 changes: 34 additions & 20 deletions ms2pip/retention_time.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import logging
import os
from pathlib import Path

import pandas as pd

# from deeplc import DeepLC
logger = logging.getLogger(__name__)

# Reduce Tensorflow logging
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"


class RetentionTime:
def __init__(self, predictor="deeplc", config=None):
def __init__(self, predictor="deeplc", config=None, num_cpu=None):
"""
Initialize peptide retention time predictor
Expand All @@ -28,6 +31,13 @@ def __init__(self, predictor="deeplc", config=None):
else:
self.config = config

if "deeplc" not in self.config:
self.config["deeplc"] = {
"verbose": False,
"calibration_file": None,
"n_jobs": num_cpu,
}

def _get_irt_peptides(self):
"""
Return DeepLC DataFrame with iRT peptides
Expand Down Expand Up @@ -57,29 +67,32 @@ def _init_deeplc(self):
"""
Initialize DeepLC: import, configurate and calibrate
"""
# Only import if DeepLC will be used, otherwise lot's of extra heavy
# Only import if DeepLC will be used, otherwise lots of extra heavy
# dependencies (e.g. Tensorflow) are imported as well
from deeplc import DeepLC
import deeplc

if "deeplc" in self.config:
deeplc_params = self.config["deeplc"]
calibration_file = deeplc_params["calibration_file"]
del deeplc_params["calibration_file"]
else:
deeplc_params = {"verbose": False}
calibration_file = None

# TODO: Remove when fixed upstream in DeepLC
if not calibration_file:
deeplc_params["split_cal"] = 9

self.deeplc_predictor = DeepLC(**deeplc_params)

if calibration_file:
cal_df = pd.read_csv(calibration_file, sep=",")
deeplc_params = self.config["deeplc"]
if "calibration_file" in deeplc_params and deeplc_params["calibration_file"]:
cal_df = pd.read_csv(deeplc_params["calibration_file"], sep=",")
else:
cal_df = self._get_irt_peptides()
deeplc_params["split_cal"] = 9 # Only 11 iRT peptides, so use 9 splits
best_model_for_irt = (
Path(deeplc.__file__).parent
/ "mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
)
if best_model_for_irt.exists():
deeplc_params["path_model"] = str(best_model_for_irt)

# Remove from deeplc_params to control calibration here instead of in DeepLC
if "calibration_file" in deeplc_params:
del deeplc_params["calibration_file"]

# PyGAM results in strange calibration on limited set of iRT peptides
deeplc_params["pygam_calibration"] = False
self.deeplc_predictor = deeplc.DeepLC(**deeplc_params)

logger.info("Calibrating DeepLC...")
self.deeplc_predictor.calibrate_preds(seq_df=cal_df)

def _prepare_deeplc_peptide_df(self):
Expand All @@ -94,6 +107,7 @@ def _run_deeplc(self):
"""
Run DeepLC
"""
logger.info("Predicting retention times with DeepLC...")
self.deeplc_preds = self.deeplc_predictor.make_preds(
seq_df=self.deeplc_pep_df.fillna("")
)
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ build-backend = "setuptools.build_meta"
python = "^3.7"
biopython = ">=1.74,<2"
numpy = ">=1.16,<2"
pandas = ">=0.24,<2"
pandas = ">=1.5,<2"
pyteomics = ">=3.5,<5"
tqdm = ">=4,<5"
tables = ">=3.4"
Expand All @@ -26,6 +26,7 @@ spectrum_utils = "^0.3.5"
click = ">=7,<9"
lxml = "^4"
rich = ">=13"
pydantic = ">=1.10,<2"

[tool.poetry.dev-dependencies]
cython = "*"
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
INSTALL_REQUIRES = [
"biopython>=1.74,<2",
"numpy>=1.16,<2",
"pandas>=0.24,<2",
"pandas>=1.5,<2",
"pyteomics>=3.5,<5",
"tqdm>=4,<5",
"tables>=3.4",
Expand All @@ -56,6 +56,7 @@
"spectrum-utils==0.3.5",
"lxml>=4",
"rich>=13",
"pydantic>=1.10,<2",
]
PYTHON_REQUIRES = ">=3.7,<4"

Expand Down
Loading

0 comments on commit 7630502

Please sign in to comment.