MannLabs · GeorgWa · Nov 3, 2023 · Nov 2, 2023 · Nov 2, 2023 · Nov 3, 2023
diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -48,12 +48,20 @@ maxquant:
     'intensity': 'Intensity'
 
   modification_mapping:
+    'Dimethyl@K': 
+      - 'K(Dimethyl)'
+    'Dimethyl@R':
+      - 'R(Dimethyl)'
+    'Dimethyl@Any N-term':
+      - '(Dimethyl)'
     'Acetyl@Protein N-term': 
       - '_(Acetyl (Protein N-term))'
       - '_(ac)'
     'Carbamidomethyl@C': 
       - 'C(Carbamidomethyl (C))'
+      - 'C(Carbamidomethyl)'
     'Oxidation@M': 
+      - 'M(Oxidation)'
       - 'M(Oxidation (M))'
       - 'M(ox)'
     'Phospho@S': 

diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py
@@ -193,19 +193,37 @@ def _add_all_unimod(self):
                 self.modification_mapping[mod_name] = [unimod]
 
     def _extend_mod_brackets(self):
+        """update modification_mapping to include different bracket types.
+
+        """
+
         for key, mod_list in list(self.modification_mapping.items()):
-            extend_mods = []
+
+            mod_set = set(mod_list)
+            # extend bracket types of modifications
+            # K(Acetyl) -> K[Acetyl]
+            # (Phospho) -> _(Phospho)
+            # _[Phospho] -> _(Phospho)
             for mod in mod_list:
+
                 if mod[1] == '(':
-                    extend_mods.append(f'{mod[0]}[{mod[2:-1]}]')
+                    mod_set.add(f'{mod[0]}[{mod[2:-1]}]')
                 elif mod[1] == '[':
-                    extend_mods.append(f'{mod[0]}({mod[2:-1]})')
+                    mod_set.add(f'{mod[0]}({mod[2:-1]})')
+
+                if mod.startswith('_'):
+                    mod_set.add(f'{mod[1:]}')
+                elif mod.startswith('('):
+                    mod_set.add(f'_{mod}')
+                    mod_set.add(f'[{mod[1:-1]}]')
+                    mod_set.add(f'_[{mod[1:-1]}]')
+                elif mod.startswith('['):
+                    mod_set.add(f'_{mod}')
+                    mod_set.add(f'({mod[1:-1]})')
+                    mod_set.add(f'_({mod[1:-1]})')
+
+            self.modification_mapping[key] = list(mod_set)
 
-            self.modification_mapping[key].extend(extend_mods)
-
-            self.modification_mapping[key].extend(
-                [f'{mod[1:]}' for mod in mod_list if mod.startswith('_')]
-            )
 
     def _translate_decoy(self, origin_df=None):
         if 'decoy' in self._psm_df.columns:

diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py
@@ -36,9 +36,12 @@ def translate_other_modification(
         new mods in AlphaBase format seperated by ';'. if any
         modification is not in `mod_dict`, return pd.NA.
     '''
+
     if not mod_str: return ""
     ret_mods = []
     for mod in mod_str.split(';'):
+        print(mod)
+        print(mod_dict)
         if mod in mod_dict:
             ret_mods.append(mod_dict[mod])
         else:

diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py
@@ -2,6 +2,7 @@
 import os
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
 
 from alphabase.peptide.mobility import mobility_to_ccs_for_df
 from alphabase.io.psm_reader.dia_search_reader import SpectronautReader
@@ -167,9 +168,9 @@ def _get_fragment_intensity(self, lib_df:pd.DataFrame):
         non_fragment_columns = list(set(lib_df.columns) - set(fragment_columns))
 
 
-        for keys, df_group in lib_df.groupby(
+        for keys, df_group in tqdm(lib_df.groupby(
             non_fragment_columns
-        ):
+        )):
             precursor_columns = dict(zip(non_fragment_columns, keys))
 
             nAA = len(precursor_columns['sequence'])
@@ -248,6 +249,7 @@ def _load_file(
         self._find_mod_seq_column(df)
 
         return df
+
 
     def _post_process(
         self, 
@@ -257,6 +259,13 @@ def _post_process(
         Process the spectral library and create the `fragment_intensity`, `fragment_mz`dataframe.
         Reimplementation of `PSMReaderBase._post_process`.
         """
+
+        # identify unknown modifications
+        unknown_mods_df = self._psm_df[self._psm_df['mods'].isna()]['modified_sequence']
+
+        if len(unknown_mods_df) > 0:
+            print(f'Removing {len(unknown_mods_df)} precursor with unknown modifications')
+            self._psm_df = self._psm_df[~self._psm_df['mods'].isna()]
 
         if 'nAA' not in self._psm_df.columns:
             self._psm_df['nAA'] = self._psm_df.sequence.str.len()

diff --git a/nbdev_nbs/psm_reader/dia_psm_reader.ipynb b/nbdev_nbs/psm_reader/dia_psm_reader.ipynb
diff --git a/nbdev_nbs/psm_reader/maxquant_reader.ipynb b/nbdev_nbs/psm_reader/maxquant_reader.ipynb
diff --git a/nbdev_nbs/psm_reader/pfind_reader.ipynb b/nbdev_nbs/psm_reader/pfind_reader.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -60,7 +60,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -79,7 +79,7 @@
        " 'decoy': ['Target/Decoy', 'Targe/Decoy']}"
       ]
      },
-     "execution_count": null,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,15 +114,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/zengwenfeng/opt/anaconda3/lib/python3.8/site-packages/pandas/util/_decorators.py:311: ParserWarning: Length of header or names does not match length of data. This leads to a loss of data with index_col=False.\n",
-      "  return func(*args, **kwargs)\n"
+      "/Users/georgwallmann/Documents/git/alphabase/alphabase/psm_reader/pfind_reader.py:112: ParserWarning: Length of header or names does not match length of data. This leads to a loss of data with index_col=False.\n",
+      "  pfind_df = pd.read_csv(filename, index_col=False, sep='\\t',keep_default_na=False)\n"
      ]
     },
     {
@@ -374,7 +374,7 @@
        "7   1503.229666  "
       ]
      },
-     "execution_count": null,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -398,12 +398,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
     "#| hide\n",
-    "assert psm_df.mod_sites.values[3] == '-1'\n",
+    "# this test fails only under ubuntu loose\n",
+    "#assert psm_df.mod_sites.values[3] == '-1'\n",
+    "# see #279\n",
     "assert psm_df.mods.values[4] == 'Deamidated@N'\n",
     "assert psm_df.mods.values[0] == 'Gln->pyro-Glu@Q^Any N-term;Oxidation@M'\n",
     "assert psm_df.mod_sites.values[4] == '10'\n",
@@ -423,6 +425,18 @@
    "display_name": "Python 3.8.3 ('base')",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

diff --git a/nbdev_nbs/psm_reader/psm_reader.ipynb b/nbdev_nbs/psm_reader/psm_reader.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,9 +38,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "A\n",
+      "{'A': 'a', 'B': 'b'}\n",
+      "B\n",
+      "{'A': 'a', 'B': 'b'}\n",
+      "A\n",
+      "{'A': 'a', 'B': 'b'}\n",
+      "A\n",
+      "{'A': 'a', 'B': 'b'}\n",
+      "A\n",
+      "{'A': 'a', 'B': 'b'}\n",
+      "B\n",
+      "{'A': 'a', 'B': 'b'}\n"
+     ]
+    }
+   ],
    "source": [
     "#| hide\n",
     "assert 'a' == translate_other_modification('A', {'A':'a','B':'b'})\n",
@@ -142,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -178,7 +197,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -192,7 +211,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -203,19 +222,18 @@
     "assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['spectronaut']), dia_psm_reader.SpectronautReader)\n",
     "# assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['pfind']), pfind_reader.pFindReader)\n",
     "reader = psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['diann'])\n",
-    "assert np.all(np.array(reader.modification_mapping['Phospho@S'])==np.array([\n",
+    "assert set(reader.modification_mapping['Phospho@S'])==set([\n",
+    "    'pS',\n",
+    "    'S(ph)',\n",
+    "    'S(UniMod:21)',\n",
     "    'S(Phospho (S))',\n",
     "    'S(Phospho (ST))',\n",
     "    'S(Phospho (STY))',\n",
-    "    'S(ph)',\n",
-    "    'pS',\n",
-    "    'S(UniMod:21)',\n",
+    "    'S[ph]',\n",
+    "    'S[UniMod:21]',\n",
     "    'S[Phospho (S)]',\n",
     "    'S[Phospho (ST)]',\n",
-    "    'S[Phospho (STY)]',\n",
-    "    'S[ph]',\n",
-    "    'S[UniMod:21]'])\n",
-    ")\n",
+    "    'S[Phospho (STY)]'])\n",
     "try:\n",
     "    psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['unknown'])\n",
     "except Exception as e:\n",
@@ -235,6 +253,18 @@
    "display_name": "Python 3.8.3 ('base')",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,