Skip to content

Commit

Permalink
Merge pull request #126 from MannLabs/development
Browse files Browse the repository at this point in the history
Release 1.1.2
  • Loading branch information
GeorgWa authored Dec 25, 2023
2 parents d63e1f8 + cb797b9 commit 57e9e51
Show file tree
Hide file tree
Showing 27 changed files with 1,395 additions and 500 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.1.1
current_version = 1.1.2
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ The infrastructure package of AlphaX ecosystem for MS proteomics. It was first p

- [AlphaPeptDeep](https://github.com/MannLabs/alphapeptdeep): deep learning framework for proteomics.
- [AlphaRaw](https://github.com/MannLabs/alpharaw): raw data reader for different vendors.
- [AlphaDIA](https://github.com/MannLabs/alphadia): DIA search engine.
- [PeptDeep-HLA](https://github.com/MannLabs/peptdeep-hla): personalized HLA-binding peptide prediction.
- [AlphaViz](https://github.com/MannLabs/alphaviz): visualization for MS-based proteomics.
- [AlphaQuant](https://github.com/MannLabs/alphaquant): quantification for MS-based proteomics.

------------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion alphabase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


__project__ = "alphabase"
__version__ = "1.1.1"
__version__ = "1.1.2"
__license__ = "Apache"
__description__ = "An infrastructure Python package of the AlphaX ecosystem"
__author__ = "Mann Labs"
Expand Down
61 changes: 42 additions & 19 deletions alphabase/constants/aa.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import os
import pandas as pd
import numpy as np

from typing import Union, Tuple
import typing

from alphabase.yaml_utils import load_yaml

from alphabase.constants.element import (
calc_mass_from_formula,
MASS_H2O, parse_formula,
reset_elements
)

from alphabase.constants._const import CONST_FILE_FOLDER
Expand All @@ -19,19 +19,34 @@
AA_Formula:dict = load_yaml(
os.path.join(CONST_FILE_FOLDER, 'amino_acid.yaml')
)
#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
AA_ASCII_MASS:np.ndarray = np.ones(128)*1e8

#: 128-len AA dataframe
AA_DF:pd.DataFrame = pd.DataFrame()

# AA formula to formula dict of dict. For example: {'K': {'C': n, 'O': m, ...}}
AA_Composition:dict = {}

def replace_atoms(atom_replace_dict:typing.Dict):
    """Rename atoms in every formula stored in `AA_Formula`.

    Each formula is parsed into an atom -> count mapping, the atoms that
    appear as keys of `atom_replace_dict` are renamed to the corresponding
    values, and the result is written back to `AA_Formula` in the
    ``Atom(count)`` notation.

    Parameters
    ----------
    atom_replace_dict : Dict
        Mapping from the atom symbol to be replaced to its replacement
        symbol. With an empty dict the formulas are only re-serialized.
    """
    # Iterate over a snapshot because AA_Formula is rewritten in the loop.
    for aa, formula in list(AA_Formula.items()):
        atom_comp = dict(parse_formula(formula))
        for atom_from, atom_to in atom_replace_dict.items():
            # Mapping an atom onto itself must be a no-op; a plain
            # "assign then delete" would remove the atom altogether.
            if atom_from == atom_to or atom_from not in atom_comp:
                continue
            count = atom_comp.pop(atom_from)
            if atom_to in atom_comp:
                # The target symbol is already part of the formula:
                # accumulate instead of overwriting its count.
                atom_comp[atom_to] += count
            else:
                atom_comp[atom_to] = count
        AA_Formula[aa] = "".join(
            f"{atom}({n})" for atom, n in atom_comp.items()
        )

def reset_AA_mass()->np.ndarray:
"""AA mass in np.array with shape (128,)"""
AA_ASCII_MASS = np.ones(128)*1e8
global AA_ASCII_MASS
for aa, chem in AA_Formula.items():
AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
return AA_ASCII_MASS

#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
AA_ASCII_MASS:np.ndarray = reset_AA_mass()
reset_AA_mass()

def reset_AA_df():
global AA_ASCII_MASS
global AA_DF
AA_DF = pd.DataFrame()
AA_DF['aa'] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
AA_DF['formula'] = ['']*len(AA_ASCII_MASS)
Expand All @@ -42,23 +57,31 @@ def reset_AA_df():
formulas.append(formula)
AA_DF.loc[aa_idxes, 'formula'] = formulas
AA_DF['mass'] = AA_ASCII_MASS
AA_ASCII_MASS = AA_DF.mass.to_numpy()
return AA_DF

#: 128-len AA dataframe
AA_DF:pd.DataFrame = reset_AA_df()

# AA to formula dict of dict. For example: {'K': {'C': n, 'O': m, ...}}
AA_Composition:dict = {}
for aa, formula, mass in AA_DF.values:
AA_Composition[aa] = dict(
parse_formula(formula)
)
reset_AA_df()

def reset_AA_Composition():
    """Rebuild the global `AA_Composition` from the rows of `AA_DF`.

    Every row of `AA_DF` is unpacked as (aa, formula, mass); the formula is
    parsed with `parse_formula` and stored as a dict under the `aa` key.
    The mass value is not used here.

    Returns
    -------
    dict
        The newly built `AA_Composition`.
    """
    global AA_Composition
    AA_Composition = {
        aa: dict(parse_formula(formula))
        for aa, formula, _mass in AA_DF.values
    }
    return AA_Composition
reset_AA_Composition()

def reset_AA_atoms(atom_replace_dict:typing.Optional[typing.Dict] = None):
    """Recompute all amino-acid globals, optionally renaming atoms first.

    Calls, in this order: `reset_elements`, `replace_atoms`,
    `reset_AA_mass`, `reset_AA_df` and `reset_AA_Composition`.

    Parameters
    ----------
    atom_replace_dict : Dict, optional
        Mapping from atom symbol to replacement symbol, forwarded to
        `replace_atoms`. Defaults to None, which is treated as an empty
        mapping (no atom is renamed).
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if atom_replace_dict is None:
        atom_replace_dict = {}
    reset_elements()
    replace_atoms(atom_replace_dict)
    reset_AA_mass()
    reset_AA_df()
    reset_AA_Composition()

def update_an_AA(aa:str, formula:str):
aa_idx = ord(aa)
AA_DF.loc[aa_idx,'formula'] = formula
AA_DF.loc[aa_idx,'mass'] = calc_mass_from_formula(formula)
AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
AA_DF.loc[aa_idx,'mass'] = AA_ASCII_MASS[aa_idx]
AA_Formula[aa] = formula
AA_Composition[aa] = dict(parse_formula(formula))

Expand Down
35 changes: 26 additions & 9 deletions alphabase/constants/atom.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import numpy as np
import numba
import typing

from alphabase.yaml_utils import load_yaml

Expand Down Expand Up @@ -89,7 +90,25 @@ def truncate_isotope(
MASS_H2O:int = None #raise errors if the value is not reset
MASS_NH3:int = None

def update_atom_infos(new_atom_info:typing.Dict):
    """Overwrite entries of `CHEM_INFO_DICT`, then call `reset_elements`.

    Args:
        new_atom_info (Dict): Maps an atom symbol to its info dict. Each
            given entry replaces the whole existing entry for that atom.
            `reset_elements` reads the "abundance" and "mass" items of
            every entry. Example, replacing N with 15N:

            {"N": {
                "abundance": [0.01, 0.99],
                "mass": [14.00307400443, 15.00010889888],
            }}
    """
    # Bulk-assign every (atom, info) pair into the shared element table.
    CHEM_INFO_DICT.update(new_atom_info.items())

    # Recompute the constants derived from the updated table.
    reset_elements()

def reset_elements():

global MASS_C, MASS_H, MASS_O, MASS_N
global MASS_H2O, MASS_NH3

for elem, items in CHEM_INFO_DICT.items():
isotopes = np.array(items['abundance'])
masses = np.array(items['mass'])
Expand Down Expand Up @@ -120,6 +139,13 @@ def reset_elements():

CHEM_ISOTOPE_DIST[elem] = _isos[start:end]
CHEM_MONO_IDX[elem] = _mono_idx

MASS_C = CHEM_MONO_MASS['C']
MASS_H = CHEM_MONO_MASS['H']
MASS_N = CHEM_MONO_MASS['N']
MASS_O = CHEM_MONO_MASS['O']
MASS_H2O = CHEM_MONO_MASS['H']*2 + CHEM_MONO_MASS['O']
MASS_NH3 = CHEM_MONO_MASS['H']*3 + CHEM_MONO_MASS['N']

def load_elem_yaml(yaml_file:str):
'''Load built-in or user-defined element yaml file. Default yaml is:
Expand All @@ -129,8 +155,6 @@ def load_elem_yaml(yaml_file:str):
global CHEM_MONO_MASS
global CHEM_ISOTOPE_DIST
global CHEM_MONO_IDX
global MASS_C, MASS_H, MASS_O, MASS_N
global MASS_H2O, MASS_NH3

CHEM_INFO_DICT = load_yaml(yaml_file)

Expand All @@ -146,13 +170,6 @@ def load_elem_yaml(yaml_file:str):
)

reset_elements()

MASS_C = CHEM_MONO_MASS['C']
MASS_H = CHEM_MONO_MASS['H']
MASS_N = CHEM_MONO_MASS['N']
MASS_O = CHEM_MONO_MASS['O']
MASS_H2O = CHEM_MONO_MASS['H']*2 + CHEM_MONO_MASS['O']
MASS_NH3 = CHEM_MONO_MASS['H']*3 + CHEM_MONO_MASS['N']

load_elem_yaml(
os.path.join(CONST_FILE_FOLDER,
Expand Down
26 changes: 19 additions & 7 deletions alphabase/peptide/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,10 +588,12 @@ def flatten_fragments(
input precursor dataframe which contains the frag_start_idx and frag_stop_idx columns
fragment_mz_df : pd.DataFrame
input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs
input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs.
Fragments with mz==0 will be excluded.
fragment_intensity_df : pd.DataFrame
input fragment mz dataframe of shape (N, T) which contains N * T fragment mzs
input fragment intensity dataframe of shape (N, T) which contains N * T fragment intensities.
Could be empty (len==0) to exclude intensity values.
min_fragment_intensity : float, optional
minimum intensity which should be retained. Defaults to -1
Expand Down Expand Up @@ -758,10 +760,12 @@ def compress_fragment_indices(frag_idx):

def remove_unused_fragments(
precursor_df: pd.DataFrame,
fragment_df_list: Tuple[pd.DataFrame, ...]
fragment_df_list: Tuple[pd.DataFrame, ...],
frag_start_col:str = 'frag_start_idx',
frag_stop_col:str = 'frag_stop_idx',
) -> Tuple[pd.DataFrame, Tuple[pd.DataFrame, ...]]:
"""Removes unused fragments of removed precursors,
reannotates the frag_start_idx and frag_stop_idx
reannotates the `frag_start_col` and `frag_stop_col`
Parameters
----------
Expand All @@ -773,19 +777,27 @@ def remove_unused_fragments(
Multiple fragment dataframes can be provided which will all be sliced in the same way.
This allows slicing both the fragment_mz_df and fragment_intensity_df.
At least one fragment dataframe needs to be provided.
frag_start_col : str, optional
Fragment start idx column in `precursor_df`, such as "frag_start_idx" and "peak_start_idx".
Defaults to "frag_start_idx".
frag_stop_col : str, optional
Fragment stop idx column in `precursor_df`, such as "frag_stop_idx" and "peak_stop_idx".
Defaults to "frag_stop_idx".
Returns
-------
pd.DataFrame, List[pd.DataFrame]
returns the reindexed precursor DataFrame and the sliced fragment DataFrames
"""

precursor_df = precursor_df.sort_values(['frag_start_idx'], ascending=True)
frag_idx = precursor_df[['frag_start_idx','frag_stop_idx']].values
precursor_df = precursor_df.sort_values([frag_start_col], ascending=True)
frag_idx = precursor_df[[frag_start_col,frag_stop_col]].values

new_frag_idx, fragment_pointer = compress_fragment_indices(frag_idx)

precursor_df[['frag_start_idx','frag_stop_idx']] = new_frag_idx
precursor_df[[frag_start_col,frag_stop_col]] = new_frag_idx
precursor_df = precursor_df.sort_index()

output_tuple = []
Expand Down
50 changes: 41 additions & 9 deletions alphabase/peptide/precursor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
import numba
import typing
import multiprocessing as mp
from tqdm import tqdm

Expand Down Expand Up @@ -486,10 +487,10 @@ def _count_batchify_df(df_group, mp_batch_size):
def calc_precursor_isotope_mp(
precursor_df:pd.DataFrame,
processes:int=8,
mp_batch_size:int=100000,
mp_batch_size:int=10000,
process_bar=None,
min_right_most_intensity:float=0.2,
min_precursor_num_to_run_mp:int=1000,
min_precursor_num_to_run_mp:int=10000,
)->pd.DataFrame:
"""`calc_precursor_isotope` is not that fast for large dataframes,
so here we use multiprocessing for faster isotope pattern calculation.
Expand Down Expand Up @@ -547,8 +548,9 @@ def calc_precursor_isotope_mp(
def calc_precursor_isotope_intensity(
precursor_df,
max_isotope = 6,
min_right_most_intensity = 0.001
):
min_right_most_intensity = 0.001,
normalize:typing.Literal['mono','sum'] = "sum",
)->pd.DataFrame:
"""Calculate isotope intensity values for precursor_df inplace.
Parameters
Expand Down Expand Up @@ -577,28 +579,57 @@ def calc_precursor_isotope_intensity(

precursor_dist = np.zeros((len(precursor_df), max_isotope), dtype=np.float32)

mono_idxes = np.zeros(len(precursor_df),dtype=np.int32)

for i in range(len(precursor_df)):

row = precursor_df.iloc[i]
dist, mono = isotope_dist.calc_formula_distribution(
get_mod_seq_formula(row['sequence'], row['mods'])
)
dist[dist <= min_right_most_intensity] = 0.
dist = dist / dist.sum()
precursor_dist[i] = dist[:max_isotope]

# mono should be always included in the i_x list
# after clipping max_isotope isotopes
mono_left_half_isotope = max_isotope//2
mono_right_half_isotope = (
mono_left_half_isotope if max_isotope%2==0
else (mono_left_half_isotope+1)
)
if mono < mono_left_half_isotope:
precursor_dist[i] = dist[:max_isotope]
mono_idxes[i] = mono
elif mono + mono_right_half_isotope >= len(dist):
precursor_dist[i] = dist[-max_isotope:]
mono_idxes[i] = max_isotope+mono-len(dist)+1
else:
precursor_dist[i] = dist[
mono-mono_left_half_isotope:
mono+mono_right_half_isotope
]
mono_idxes[i] = mono-mono_left_half_isotope

if normalize == "sum":
precursor_dist /= np.sum(precursor_dist, axis=1, keepdims=True)
else:
precursor_dist /= precursor_dist[
np.arange(len(precursor_dist)), mono_idxes
].reshape(-1,1)

precursor_df[col_names] = precursor_dist
precursor_df["mono_isotope_idx"] = mono_idxes

return precursor_df

def calc_precursor_isotope_intensity_mp(
precursor_df,
max_isotope = 6,
min_right_most_intensity = 0.001,
normalize:typing.Literal['mono','sum'] = "sum",
mp_batch_size = 1000,
mp_process_num = 8,
progress_bar = True
):
progress_bar = True,
)->pd.DataFrame:

"""Calculate isotope intensity values for precursor_df using multiprocessing.
Expand Down Expand Up @@ -639,7 +670,8 @@ def calc_precursor_isotope_intensity_mp(
partial(
calc_precursor_isotope_intensity,
max_isotope=max_isotope,
min_right_most_intensity=min_right_most_intensity
min_right_most_intensity=min_right_most_intensity,
normalize=normalize,
), _batchify_df(df_group, mp_batch_size)
)

Expand Down
2 changes: 1 addition & 1 deletion alphabase/spectral_library/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def calc_precursor_isotope_intensity(self,
multiprocessing : bool=True,
max_isotope = 6,
min_right_most_intensity = 0.001,
mp_batch_size = 1000,
mp_batch_size = 10000,
mp_process_num = 8
):
"""
Expand Down
Loading

0 comments on commit 57e9e51

Please sign in to comment.