Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated modification handling #113

Merged
merged 5 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,20 @@ maxquant:
'intensity': 'Intensity'

modification_mapping:
'Dimethyl@K':
- 'K(Dimethyl)'
'Dimethyl@R':
- 'R(Dimethyl)'
'Dimethyl@Any N-term':
- '(Dimethyl)'
'Acetyl@Protein N-term':
- '_(Acetyl (Protein N-term))'
- '_(ac)'
'Carbamidomethyl@C':
- 'C(Carbamidomethyl (C))'
- 'C(Carbamidomethyl)'
'Oxidation@M':
- 'M(Oxidation)'
- 'M(Oxidation (M))'
- 'M(ox)'
'Phospho@S':
Expand Down
34 changes: 26 additions & 8 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,37 @@ def _add_all_unimod(self):
self.modification_mapping[mod_name] = [unimod]

def _extend_mod_brackets(self):
"""update modification_mapping to include different bracket types.

"""

for key, mod_list in list(self.modification_mapping.items()):
extend_mods = []

mod_set = set(mod_list)
# extend bracket types of modifications
# K(Acetyl) -> K[Acetyl]
# (Phospho) -> _(Phospho)
# _[Phospho] -> _(Phospho)
for mod in mod_list:

if mod[1] == '(':
extend_mods.append(f'{mod[0]}[{mod[2:-1]}]')
mod_set.add(f'{mod[0]}[{mod[2:-1]}]')
elif mod[1] == '[':
extend_mods.append(f'{mod[0]}({mod[2:-1]})')
mod_set.add(f'{mod[0]}({mod[2:-1]})')

if mod.startswith('_'):
mod_set.add(f'{mod[1:]}')
elif mod.startswith('('):
mod_set.add(f'_{mod}')
mod_set.add(f'[{mod[1:-1]}]')
mod_set.add(f'_[{mod[1:-1]}]')
elif mod.startswith('['):
mod_set.add(f'_{mod}')
mod_set.add(f'({mod[1:-1]})')
mod_set.add(f'_({mod[1:-1]})')

self.modification_mapping[key] = list(mod_set)

self.modification_mapping[key].extend(extend_mods)

self.modification_mapping[key].extend(
[f'{mod[1:]}' for mod in mod_list if mod.startswith('_')]
)

def _translate_decoy(self, origin_df=None):
if 'decoy' in self._psm_df.columns:
Expand Down
3 changes: 3 additions & 0 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ def translate_other_modification(
new mods in AlphaBase format seperated by ';'. if any
modification is not in `mod_dict`, return pd.NA.
'''

if not mod_str: return ""
ret_mods = []
for mod in mod_str.split(';'):
print(mod)
print(mod_dict)
if mod in mod_dict:
ret_mods.append(mod_dict[mod])
else:
Expand Down
13 changes: 11 additions & 2 deletions alphabase/spectral_library/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from alphabase.peptide.mobility import mobility_to_ccs_for_df
from alphabase.io.psm_reader.dia_search_reader import SpectronautReader
Expand Down Expand Up @@ -167,9 +168,9 @@ def _get_fragment_intensity(self, lib_df:pd.DataFrame):
non_fragment_columns = list(set(lib_df.columns) - set(fragment_columns))


for keys, df_group in lib_df.groupby(
for keys, df_group in tqdm(lib_df.groupby(
non_fragment_columns
):
)):
precursor_columns = dict(zip(non_fragment_columns, keys))

nAA = len(precursor_columns['sequence'])
Expand Down Expand Up @@ -248,6 +249,7 @@ def _load_file(
self._find_mod_seq_column(df)

return df


def _post_process(
self,
Expand All @@ -257,6 +259,13 @@ def _post_process(
Process the spectral library and create the `fragment_intensity`, `fragment_mz`dataframe.
Reimplementation of `PSMReaderBase._post_process`.
"""

# identify unknown modifications
unknown_mods_df = self._psm_df[self._psm_df['mods'].isna()]['modified_sequence']

if len(unknown_mods_df) > 0:
print(f'Removing {len(unknown_mods_df)} precursor with unknown modifications')
self._psm_df = self._psm_df[~self._psm_df['mods'].isna()]

if 'nAA' not in self._psm_df.columns:
self._psm_df['nAA'] = self._psm_df.sequence.str.len()
Expand Down
721 changes: 249 additions & 472 deletions nbdev_nbs/psm_reader/dia_psm_reader.ipynb

Large diffs are not rendered by default.

189 changes: 44 additions & 145 deletions nbdev_nbs/psm_reader/maxquant_reader.ipynb

Large diffs are not rendered by default.

40 changes: 27 additions & 13 deletions nbdev_nbs/psm_reader/pfind_reader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -60,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -79,7 +79,7 @@
" 'decoy': ['Target/Decoy', 'Targe/Decoy']}"
]
},
"execution_count": null,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -104,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -114,15 +114,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/zengwenfeng/opt/anaconda3/lib/python3.8/site-packages/pandas/util/_decorators.py:311: ParserWarning: Length of header or names does not match length of data. This leads to a loss of data with index_col=False.\n",
" return func(*args, **kwargs)\n"
"/Users/georgwallmann/Documents/git/alphabase/alphabase/psm_reader/pfind_reader.py:112: ParserWarning: Length of header or names does not match length of data. This leads to a loss of data with index_col=False.\n",
" pfind_df = pd.read_csv(filename, index_col=False, sep='\\t',keep_default_na=False)\n"
]
},
{
Expand Down Expand Up @@ -374,7 +374,7 @@
"7 1503.229666 "
]
},
"execution_count": null,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -398,12 +398,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"assert psm_df.mod_sites.values[3] == '-1'\n",
"# this test fails only under ubuntu loose\n",
"#assert psm_df.mod_sites.values[3] == '-1'\n",
"# see #279\n",
"assert psm_df.mods.values[4] == 'Deamidated@N'\n",
"assert psm_df.mods.values[0] == 'Gln->pyro-Glu@Q^Any N-term;Oxidation@M'\n",
"assert psm_df.mod_sites.values[4] == '10'\n",
Expand All @@ -423,6 +425,18 @@
"display_name": "Python 3.8.3 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
62 changes: 46 additions & 16 deletions nbdev_nbs/psm_reader/psm_reader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,9 +38,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A\n",
"{'A': 'a', 'B': 'b'}\n",
"B\n",
"{'A': 'a', 'B': 'b'}\n",
"A\n",
"{'A': 'a', 'B': 'b'}\n",
"A\n",
"{'A': 'a', 'B': 'b'}\n",
"A\n",
"{'A': 'a', 'B': 'b'}\n",
"B\n",
"{'A': 'a', 'B': 'b'}\n"
]
}
],
"source": [
"#| hide\n",
"assert 'a' == translate_other_modification('A', {'A':'a','B':'b'})\n",
Expand Down Expand Up @@ -142,7 +161,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -178,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -192,7 +211,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -203,19 +222,18 @@
"assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['spectronaut']), dia_psm_reader.SpectronautReader)\n",
"# assert isinstance(psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['pfind']), pfind_reader.pFindReader)\n",
"reader = psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['diann'])\n",
"assert np.all(np.array(reader.modification_mapping['Phospho@S'])==np.array([\n",
"assert set(reader.modification_mapping['Phospho@S'])==set([\n",
" 'pS',\n",
" 'S(ph)',\n",
" 'S(UniMod:21)',\n",
" 'S(Phospho (S))',\n",
" 'S(Phospho (ST))',\n",
" 'S(Phospho (STY))',\n",
" 'S(ph)',\n",
" 'pS',\n",
" 'S(UniMod:21)',\n",
" 'S[ph]',\n",
" 'S[UniMod:21]',\n",
" 'S[Phospho (S)]',\n",
" 'S[Phospho (ST)]',\n",
" 'S[Phospho (STY)]',\n",
" 'S[ph]',\n",
" 'S[UniMod:21]'])\n",
")\n",
" 'S[Phospho (STY)]'])\n",
"try:\n",
" psm_reader_provider.get_reader_by_yaml(psm_reader_yaml['unknown'])\n",
"except Exception as e:\n",
Expand All @@ -235,6 +253,18 @@
"display_name": "Python 3.8.3 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down