Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix double modifications #188

Merged
merged 20 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
602 changes: 301 additions & 301 deletions alphabase/constants/const_files/__used_mod.yaml

Large diffs are not rendered by default.

602 changes: 301 additions & 301 deletions alphabase/constants/const_files/modification.tsv

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ alphapept:
'Phospho@S': 'pS'
'Phospho@T': 'pT'
'Phospho@Y': 'pY'
'Acetyl@Protein N-term': 'a'
'Acetyl@Protein_N-term': 'a'

maxquant:
reader_type: maxquant
Expand Down Expand Up @@ -52,10 +52,10 @@ maxquant:
- 'K(Dimethyl)'
'Dimethyl@R':
- 'R(Dimethyl)'
'Dimethyl@Any N-term':
'Dimethyl@Any_N-term':
- '(Dimethyl)'
'Acetyl@Protein N-term':
- '_(Acetyl (Protein N-term))'
'Acetyl@Protein_N-term':
- '_(Acetyl (Protein_N-term))'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The actual unimod names contain a space, so we cannot modify the MQ source PTM names

Copy link
Contributor Author

@mschwoer mschwoer Nov 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

released in 1.3.0, fixed here for 1.4.2: #241

- '_(ac)'
'Carbamidomethyl@C':
- 'C(Carbamidomethyl (C))'
Expand Down Expand Up @@ -124,10 +124,10 @@ msfragger_pepxml:
- 'Phospho@S' #TY are not needed here
- 'GlyGly@K'
- 'Cysteinyl@C'
- 'Acetyl@Any N-term'
- 'Glu->pyro-Glu@E^Any N-term'
- 'Gln->pyro-Glu@Q^Any N-term'
- 'Dimethyl@K' # Any N-term is not needed here as it will be inferred on-the-fly
- 'Acetyl@Any_N-term'
- 'Glu->pyro-Glu@E^Any_N-term'
- 'Gln->pyro-Glu@Q^Any_N-term'
- 'Dimethyl@K' # Any_N-term is not needed here as it will be inferred on-the-fly
- 'Methyl@E' #an example of a PTM that can be C-term
mod_mass_tol: 0.1 # Da

Expand Down
16 changes: 10 additions & 6 deletions alphabase/constants/modification.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,18 @@ def load_mod_df(
):
global MOD_DF
MOD_DF = pd.read_table(tsv, keep_default_na=False)
_df = MOD_DF[MOD_DF.mod_name.str.contains(" ", regex=False)].copy()
_df["mod_name"] = MOD_DF.mod_name.str.replace(" ", "_", regex=False)
MOD_DF = pd.concat([MOD_DF, _df], ignore_index=True).drop_duplicates("mod_name")

if any(mask := MOD_DF["mod_name"].str.contains(" ", regex=False)):
raise ValueError(
f"Modification names must not contain spaces: {MOD_DF[mask]['mod_name'].values}"
)

MOD_DF.drop_duplicates("mod_name", inplace=True)
MOD_DF.fillna("", inplace=True)
MOD_DF["unimod_id"] = MOD_DF.unimod_id.astype(np.int32)
MOD_DF["unimod_id"] = MOD_DF["unimod_id"].astype(np.int32)
MOD_DF.set_index("mod_name", drop=False, inplace=True)
MOD_DF["mass"] = MOD_DF.composition.apply(calc_mass_from_formula)
MOD_DF["modloss_original"] = MOD_DF.modloss_composition.apply(
MOD_DF["mass"] = MOD_DF["composition"].apply(calc_mass_from_formula)
MOD_DF["modloss_original"] = MOD_DF["modloss_composition"].apply(
calc_mass_from_formula
)
MOD_DF["modloss"] = MOD_DF["modloss_original"]
Expand Down
2 changes: 1 addition & 1 deletion alphabase/peptide/precursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def hash_precursor_df(precursor_df: pd.DataFrame, *, seed: int = 0) -> pd.DataFr

def get_mod_seq_formula(seq: str, mods: str) -> list:
"""
'PEPTIDE','Acetyl@Any N-term' --> [('C',n), ('H',m), ...]
'PEPTIDE','Acetyl@Any_N-term' --> [('C',n), ('H',m), ...]
"""
formula = {}
for aa in seq:
Expand Down
12 changes: 6 additions & 6 deletions alphabase/protein/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,9 +468,9 @@ def parse_labels(labels: list):
if len(aa) == 1:
label_aas += aa
label_mod_dict[aa] = label
elif aa == "Any N-term" or aa == "Any_N-term":
elif aa == "Any_N-term":
nterm_label_mod = label
elif aa == "Any C-term" or aa == "Any_C-term":
elif aa == "Any_C-term":
cterm_label_mod = label
return label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod

Expand Down Expand Up @@ -815,13 +815,13 @@ def _set_dict(term_dict, site, mod, allow_conflicts):
term_dict[site] = term_mod

site, term = parse_term_mod(term_mod)
if term == "Any N-term" or term == "Any_N-term":
if term == "Any_N-term":
_set_dict(pep_nterm, site, term_mod, allow_conflicts)
elif term == "Protein N-term" or term == "Protein_N-term":
elif term == "Protein_N-term":
_set_dict(prot_nterm, site, term_mod, allow_conflicts)
elif term == "Any C-term" or term == "Any_C-term":
elif term == "Any_C-term":
_set_dict(pep_cterm, site, term_mod, allow_conflicts)
elif term == "Protein C-term" or term == "Protein_C-term":
elif term == "Protein_C-term":
_set_dict(prot_cterm, site, term_mod, allow_conflicts)

# for mod in self.fix_mods:
Expand Down
6 changes: 3 additions & 3 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def _get_mods_from_masses(sequence, msf_aa_mods):
for mod_name in mass_mapped_mods:
if abs(mod_mass - MOD_MASS[mod_name]) < mod_mass_tol:
if site == 0:
_mod = mod_name.split("@")[0] + "@Any N-term"
_mod = mod_name.split("@")[0] + "@Any_N-term"
elif site == 1:
if mod_name.endswith("^Any N-term"):
if mod_name.endswith("^Any_N-term"):
_mod = mod_name
site_str = "0"
else:
Expand All @@ -58,7 +58,7 @@ def _get_mods_from_masses(sequence, msf_aa_mods):
_mod = mod_name
else:
_mod = (
mod_name.split("@")[0] + "@Any C-term"
mod_name.split("@")[0] + "@Any_C-term"
) # what if only Protein C-term is listed?
site_str = "-1"
else:
Expand Down
16 changes: 8 additions & 8 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,21 @@ def convert_one_pFind_mod(mod):
if len(site) == 1:
return name + "@" + site
elif site == "AnyN-term":
return name + "@" + "Any N-term"
return name + "@" + "Any_N-term"
elif site == "ProteinN-term":
return name + "@" + "Protein N-term"
return name + "@" + "Protein_N-term"
elif site.startswith("AnyN-term"):
return name + "@" + site[-1] + "^Any N-term"
return name + "@" + site[-1] + "^Any_N-term"
elif site.startswith("ProteinN-term"):
return name + "@" + site[-1] + "^Protein N-term"
return name + "@" + site[-1] + "^Protein_N-term"
elif site == "AnyC-term":
return name + "@" + "Any C-term"
return name + "@" + "Any_C-term"
elif site == "ProteinC-term":
return name + "@" + "Protein C-term"
return name + "@" + "Protein_C-term"
elif site.startswith("AnyC-term"):
return name + "@" + site[-1] + "^Any C-term"
return name + "@" + site[-1] + "^Any_C-term"
elif site.startswith("ProteinC-term"):
return name + "@" + site[-1] + "^Protein C-term"
return name + "@" + site[-1] + "^Protein_C-term"
else:
return None

Expand Down
6 changes: 3 additions & 3 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def add_modification_mapping(self, modification_mapping: dict):
```
add_modification_mapping({
'Dimethyl@K': ['K(Dimethyl)'], # list
'Dimethyl@Any N-term': '_(Dimethyl)', # str
'Dimethyl@Any_N-term': '_(Dimethyl)', # str
})
```
"""
Expand Down Expand Up @@ -241,7 +241,7 @@ def _reverse_mod_mapping(self):
if isinstance(other_mod, (list, tuple)):
for _mod in other_mod:
if _mod in self.rev_mod_mapping:
if this_mod.endswith("Protein N-term"):
if this_mod.endswith("Protein_N-term"):
continue
self.rev_mod_mapping[_mod] = this_mod
else:
Expand Down Expand Up @@ -516,7 +516,7 @@ def filter_psm_by_modifications(
"Phospho@S",
"Phospho@T",
"Phospho@Y",
"Acetyl@Protein N-term",
"Acetyl@Protein_N-term",
]
),
):
Expand Down
Loading
Loading