Completely refactor fasta2speclib; implement new features:

- Support for C-terminal modifications - Differentiate between peptide and protein termini for variable modifications - Allow filtering of peptides based on precursor m/z - Allow semi-specific cleavage - Allow setting of a maximum of variable modifications per peptide - Disable PyGAM for DeepLC calibration on iRT peptides (leads to strange predictions) - Add tests for modification assignment - Removed support for: Elude-based RT predictions, RT predictions file, PEPREC filter, saving temporary PEPREC files
compomics · Feb 21, 2023 · 7630502 · 7630502
1 parent 3e4935d
commit 7630502
Show file tree

Hide file tree

Showing 9 changed files with 874 additions and 532 deletions.
diff --git a/fasta2speclib/fasta2speclib.py b/fasta2speclib/fasta2speclib.py
diff --git a/fasta2speclib/fasta2speclib_config.json b/fasta2speclib/fasta2speclib_config.json
@@ -1,25 +1,19 @@
 {
-    "output_filetype":["msp", "mgf", "bibliospec", "spectronaut", "hdf", "dlib"],
+    "output_filetype":["msp", "mgf", "bibliospec", "dlib"],
     "charges":[2, 3],
     "min_peplen":8,
     "max_peplen":30,
     "cleavage_rule":"trypsin",
     "missed_cleavages":2,
     "modifications":[
-        {"name":"Glu->pyro-Glu", "unimod_accession":27, "mass_shift":-18.010565, "amino_acid":"E", "n_term":true, "fixed":false},
-        {"name":"Gln->pyro-Glu", "unimod_accession":28, "mass_shift":-17.026549, "amino_acid":"Q", "n_term":true, "fixed":false},
-        {"name":"Acetyl", "unimod_accession":1, "mass_shift":42.010565, "amino_acid":null, "n_term":true, "fixed":false},
-        {"name":"Oxidation", "unimod_accession":35, "mass_shift":15.994915, "amino_acid":"M", "n_term":false, "fixed":false},
-        {"name":"Carbamidomethyl", "unimod_accession":4, "mass_shift":57.021464, "amino_acid":"C", "n_term":false, "fixed":true}
+        {"name":"Acetyl", "unimod_accession":1, "mass_shift":42.010565, "amino_acid":null, "protein_n_term":true},
+        {"name":"Oxidation", "unimod_accession":35, "mass_shift":15.994915, "amino_acid":"M"},
+        {"name":"Carbamidomethyl", "unimod_accession":4, "mass_shift":57.021464, "amino_acid":"C", "fixed":true}
     ],
     "ms2pip_model":"HCD",
-    "decoy":true,
+    "decoy":false,
     "add_retention_time":true,
     "deeplc": {},
-    "elude_model_file":null,
-    "rt_predictions_file":null,
-    "peprec_filter":null,
-    "save_peprec":false,
     "batch_size":10000,
-    "num_cpu":24
+    "num_cpu":null
 }
diff --git a/ms2pip/__init__.py b/ms2pip/__init__.py
@@ -0,0 +1 @@
+from ms2pip.ms2pipC import MS2PIP
diff --git a/ms2pip/ms2pipC.py b/ms2pip/ms2pipC.py
@@ -15,21 +15,23 @@
 from rich.progress import track
 
 from ms2pip.cython_modules import ms2pip_pyx
-from ms2pip.exceptions import (FragmentationModelRequiredError,
-                               InvalidModificationFormattingError,
-                               InvalidPEPRECError, MissingConfigurationError,
-                               NoMatchingSpectraFound,
-                               NoValidPeptideSequencesError, TitlePatternError,
-                               UnknownFragmentationMethodError,
-                               UnknownModificationError,
-                               UnknownOutputFormatError)
+from ms2pip.exceptions import (
+    FragmentationModelRequiredError,
+    InvalidModificationFormattingError,
+    InvalidPEPRECError,
+    MissingConfigurationError,
+    NoMatchingSpectraFound,
+    NoValidPeptideSequencesError,
+    TitlePatternError,
+    UnknownFragmentationMethodError,
+    UnknownModificationError,
+    UnknownOutputFormatError,
+)
 from ms2pip.feature_names import get_feature_names_new
 from ms2pip.match_spectra import MatchSpectra
 from ms2pip.ms2pip_tools import calc_correlations, spectrum_output
-from ms2pip.peptides import (AMINO_ACID_IDS, Modifications,
-                             write_amino_acid_masses)
-from ms2pip.predict_xgboost import (get_predictions_xgb,
-                                    validate_requested_xgb_model)
+from ms2pip.peptides import AMINO_ACID_IDS, Modifications, write_amino_acid_masses
+from ms2pip.predict_xgboost import get_predictions_xgb, validate_requested_xgb_model
 from ms2pip.retention_time import RetentionTime
 from ms2pip.spectrum import read_spectrum_file
 
@@ -545,18 +547,23 @@ def apply_mods(peptide, mods, PTMmap):
     version of the peptide sequence, c- and n-term modifications. This modified
     version are hard coded in ms2pipfeatures_c.c for now.
     """
-    modpeptide = np.array(peptide[:], dtype=np.uint16)
-
+    modpeptide = np.array(peptide[:], dtype=np.uint16)  # Copy to avoid inplace changes
     if mods != "-":
         l = mods.split("|")
         if len(l) % 2 != 0:
             raise InvalidModificationFormattingError(mods)
         for i in range(0, len(l), 2):
             tl = l[i + 1]
-            if tl in PTMmap:
-                modpeptide[int(l[i])] = PTMmap[tl]
-            else:
+            try:
+                mod = PTMmap[tl]
+            except:
                 raise UnknownModificationError(tl)
+            try:
+                modpeptide[int(l[i])] = mod
+            except IndexError:
+                raise InvalidModificationFormattingError(
+                    f"Amino acid position not in peptide for modifications: `{mods}`"
+                )
 
     return modpeptide
 
@@ -574,7 +581,7 @@ def __init__(
         self,
         pep_file,
         spec_file=None,
-        spectrum_id_pattern="(.*)",
+        spectrum_id_pattern=None,
         vector_file=None,
         num_cpu=1,
         params=None,
@@ -652,7 +659,9 @@ def __init__(
         """
         self.pep_file = pep_file
         self.vector_file = vector_file
-        self.spectrum_id_pattern = spectrum_id_pattern
+        self.spectrum_id_pattern = (
+            spectrum_id_pattern if spectrum_id_pattern else "(.*)"
+        )
         self.num_cpu = num_cpu
         self.params = params
         self.return_results = return_results
@@ -709,12 +718,14 @@ def __init__(
         else:
             raise UnknownFragmentationMethodError(self.model)
 
+        # TODO: Move to run?
         if output_filename is None and not return_results:
             self.output_filename = "{}_{}".format(
                 ".".join(pep_file.split(".")[:-1]), self.model
             )
         else:
             self.output_filename = output_filename
+        # TODO
 
         logger.debug(f"Starting workers (num_cpu={self.num_cpu})...")
         if multiprocessing.current_process().daemon:
@@ -728,6 +739,7 @@ def __init__(
         else:
             self.myPool = multiprocessing.Pool(self.num_cpu)
 
+        # TODO: Move to run?
         if self.match_spectra:
             self.spec_file = None
             if self.sqldb_uri:
@@ -740,24 +752,29 @@ def __init__(
         else:
             self.spec_file = spec_file
             self.spec_files = None
+        # TODO
 
         self.mods = Modifications()
         for mod_type in ("sptm", "ptm"):
             self.mods.add_from_ms2pip_modstrings(
                 self.params["ms2pip"][mod_type], mod_type=mod_type
             )
 
+    # TODO: Pass PEPREC and SPECFILE args here?
     def run(self):
         """Run initiated MS2PIP based on class configuration."""
+
+        # TODO: MOVE TO INIT?
         self.afile = write_amino_acid_masses()
         self.modfile = self.mods.write_modifications_file(mod_type="ptm")
         self.modfile2 = self.mods.write_modifications_file(mod_type="sptm")
+        #
 
         self._read_peptide_information()
 
         if self.add_retention_time:
             logger.info("Adding retention time predictions")
-            rt_predictor = RetentionTime(config=self.params)
+            rt_predictor = RetentionTime(config=self.params, num_cpu=self.num_cpu)
             rt_predictor.add_rt_predictions(self.data)
 
         # Spectrum file mode
@@ -846,8 +863,8 @@ def _read_peptide_information(self):
             )
         else:
             data = self.pep_file
-        # for some reason the missing values are converted to float otherwise
-        data = data.fillna("-")
+        # Unmodified peptides should have a '-', not NaN or an empty string
+        data["modifications"] = data["modifications"].replace("", "-").fillna("-")
 
         # Filter PEPREC for unsupported peptides
         num_pep = len(data)

diff --git a/ms2pip/ms2pip_tools/spectrum_output.py b/ms2pip/ms2pip_tools/spectrum_output.py
@@ -252,7 +252,7 @@ def _get_msp_modifications(self, sequence, modifications):
         """
 
         if isinstance(modifications, str):
-            if modifications == "-":
+            if not modifications or modifications == "-":
                 msp_modifications = "0"
             else:
                 mods = modifications.split("|")
@@ -835,10 +835,7 @@ def write_csv(self):
             logger.info("Writing results to %s", f_name)
 
         self.all_preds.to_csv(
-            file_object,
-            float_format="%.6g",
-            index=False,
-            lineterminator="\n"
+            file_object, float_format="%.6g", index=False, lineterminator="\n"
         )
         return file_object
 

diff --git a/ms2pip/retention_time.py b/ms2pip/retention_time.py
@@ -1,14 +1,17 @@
+import logging
 import os
+from pathlib import Path
 
 import pandas as pd
 
-# from deeplc import DeepLC
+logger = logging.getLogger(__name__)
 
 # Reduce Tensorflow logging
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 
+
 class RetentionTime:
-    def __init__(self, predictor="deeplc", config=None):
+    def __init__(self, predictor="deeplc", config=None, num_cpu=None):
         """
         Initialize peptide retention time predictor
 
@@ -28,6 +31,13 @@ def __init__(self, predictor="deeplc", config=None):
         else:
             self.config = config
 
+        if "deeplc" not in self.config:
+            self.config["deeplc"] = {
+                "verbose": False,
+                "calibration_file": None,
+                "n_jobs": num_cpu,
+            }
+
     def _get_irt_peptides(self):
         """
         Return DeepLC DataFrame with iRT peptides
@@ -57,29 +67,32 @@ def _init_deeplc(self):
         """
         Initialize DeepLC: import, configurate and calibrate
         """
-        # Only import if DeepLC will be used, otherwise lot's of extra heavy
+        # Only import if DeepLC will be used, otherwise lots of extra heavy
         # dependencies (e.g. Tensorflow) are imported as well
-        from deeplc import DeepLC
+        import deeplc
 
-        if "deeplc" in self.config:
-            deeplc_params = self.config["deeplc"]
-            calibration_file = deeplc_params["calibration_file"]
-            del deeplc_params["calibration_file"]
-        else:
-            deeplc_params = {"verbose": False}
-            calibration_file = None
-
-        # TODO: Remove when fixed upstream in DeepLC
-        if not calibration_file:
-            deeplc_params["split_cal"] = 9
-
-        self.deeplc_predictor = DeepLC(**deeplc_params)
-
-        if calibration_file:
-            cal_df = pd.read_csv(calibration_file, sep=",")
+        deeplc_params = self.config["deeplc"]
+        if "calibration_file" in deeplc_params and deeplc_params["calibration_file"]:
+            cal_df = pd.read_csv(deeplc_params["calibration_file"], sep=",")
         else:
             cal_df = self._get_irt_peptides()
+            deeplc_params["split_cal"] = 9  # Only 11 iRT peptides, so use 9 splits
+            best_model_for_irt = (
+                Path(deeplc.__file__).parent
+                / "mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
+            )
+            if best_model_for_irt.exists():
+                deeplc_params["path_model"] = str(best_model_for_irt)
+
+        # Remove from deeplc_params to control calibration here instead of in DeepLC
+        if "calibration_file" in deeplc_params:
+            del deeplc_params["calibration_file"]
+
+        # PyGAM results in strange calibration on limited set of iRT peptides
+        deeplc_params["pygam_calibration"] = False
+        self.deeplc_predictor = deeplc.DeepLC(**deeplc_params)
 
+        logger.info("Calibrating DeepLC...")
         self.deeplc_predictor.calibrate_preds(seq_df=cal_df)
 
     def _prepare_deeplc_peptide_df(self):
@@ -94,6 +107,7 @@ def _run_deeplc(self):
         """
         Run DeepLC
         """
+        logger.info("Predicting retention times with DeepLC...")
         self.deeplc_preds = self.deeplc_predictor.make_preds(
             seq_df=self.deeplc_pep_df.fillna("")
         )

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "setuptools.build_meta"
 python = "^3.7"
 biopython = ">=1.74,<2"
 numpy = ">=1.16,<2"
-pandas = ">=0.24,<2"
+pandas = ">=1.5,<2"
 pyteomics = ">=3.5,<5"
 tqdm = ">=4,<5"
 tables = ">=3.4"
@@ -26,6 +26,7 @@ spectrum_utils = "^0.3.5"
 click = ">=7,<9"
 lxml = "^4"
 rich = ">=13"
+pydantic = ">=1.10,<2"
 
 [tool.poetry.dev-dependencies]
 cython = "*"

diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@
 INSTALL_REQUIRES = [
     "biopython>=1.74,<2",
     "numpy>=1.16,<2",
-    "pandas>=0.24,<2",
+    "pandas>=1.5,<2",
     "pyteomics>=3.5,<5",
     "tqdm>=4,<5",
     "tables>=3.4",
@@ -56,6 +56,7 @@
     "spectrum-utils==0.3.5",
     "lxml>=4",
     "rich>=13",
+    "pydantic>=1.10,<2",
 ]
 PYTHON_REQUIRES = ">=3.7,<4"