Skip to content

Commit

Permalink
Merge pull request #248 from MannLabs/refactor_readers_VI
Browse files Browse the repository at this point in the history
Refactor readers vi
  • Loading branch information
mschwoer authored Jan 9, 2025
2 parents d120c64 + 23030f0 commit 0f963aa
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 71 deletions.
34 changes: 20 additions & 14 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ maxquant:
reader_type: maxquant
rt_unit: minute
fixed_C57: True
mod_seq_columns:
- 'Modified sequence'
column_mapping:
'sequence': 'Sequence'
'charge': 'Charge'
Expand Down Expand Up @@ -171,6 +173,8 @@ diann:
'scan_num': 'MS2.Scan'
'score': 'CScore'
'fdr': 'Q.Value'
mod_seq_columns:
- "Modified.Sequence"
modification_mapping: 'maxquant'

spectronaut_report:
Expand All @@ -185,19 +189,14 @@ spectronaut_report:
'genes': 'PG.Genes'
'uniprot_ids': 'PG.UniProtIds'
'charge': 'charge'
mod_seq_columns:
- 'ModifiedSequence'
modification_mapping: 'maxquant'

spectronaut:
reader_type: spectronaut
rt_unit: irt
fixed_C57: False
mod_seq_columns:
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
column_mapping:
'raw_name': 'ReferenceRun'
'sequence': ['StrippedPeptide','PeptideSequence']
Expand All @@ -209,19 +208,19 @@ spectronaut:
'proteins': ['Protein Name','ProteinId','ProteinID','ProteinName','ProteinGroup','ProteinGroups']
'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
'genes': ['Genes','Gene','GeneName','GeneNames']
modification_mapping: 'maxquant'

library_reader_base:
reader_type: library_reader_base
rt_unit: irt
fixed_C57: False
mod_seq_columns:
- 'ModifiedPeptideSequence'
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'

library_reader_base:
reader_type: library_reader_base
rt_unit: irt
fixed_C57: False
column_mapping:
'raw_name': 'ReferenceRun'
'sequence': ['PeptideSequence', 'StrippedPeptide']
Expand All @@ -240,6 +239,13 @@ library_reader_base:
'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge']
'fragment_series': ['FragmentSeriesNumber','FragmentNumber']
'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType']
mod_seq_columns:
- 'ModifiedPeptideSequence'
- 'ModifiedPeptide'
- 'ModifiedSequence'
- 'FullUniModPeptideName'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'

sage:
Expand Down
21 changes: 8 additions & 13 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml
from alphabase.psm_reader.psm_reader import psm_reader_provider


class SpectronautReader(MaxQuantReader):
Expand All @@ -33,9 +33,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
):
"""Initialize SpectronautReader."""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
Expand All @@ -47,14 +44,15 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.mod_seq_column = "ModifiedPeptide"
self._min_max_rt_norm = True

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
if "ReferenceRun" in df.columns:
df.drop_duplicates(
["ReferenceRun", self.mod_seq_column, "PrecursorCharge"], inplace=True
Expand Down Expand Up @@ -84,9 +82,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
):
"""SWATH or OpenSWATH library, similar to `SpectronautReader`."""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
Expand Down Expand Up @@ -126,7 +121,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.mod_seq_column = "Modified.Sequence"
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
Expand Down Expand Up @@ -172,15 +166,16 @@ def __init__( # noqa: PLR0913 many arguments in function definition
**kwargs,
)

self.precursor_column = "EG.PrecursorId"
self.mod_seq_column = "ModifiedSequence"

self.precursor_column = "EG.PrecursorId" # TODO: move to yaml
self._min_max_rt_norm = False

def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self.mod_seq_column = self._get_mod_seq_column(
df
) # TODO: this needs to be removed
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
Expand Down
9 changes: 1 addition & 8 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,23 +177,17 @@ def __init__( # noqa: PLR0913 many arguments in function definition
deprecated
"""
if mod_seq_columns is None:
mod_seq_columns = [
"Modified sequence"
] # TODO: why not take from psm_reader.yaml?

super().__init__(
column_mapping=column_mapping,
modification_mapping=modification_mapping,
fdr=fdr,
keep_decoy=keep_decoy,
rt_unit=rt_unit,
mod_seq_columns=mod_seq_columns,
**kwargs,
)

self.fixed_C57 = fixed_C57
self._mod_seq_columns = mod_seq_columns
self.mod_seq_column = "Modified sequence"

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand All @@ -205,7 +199,6 @@ def _load_file(self, filename: str) -> pd.DataFrame:
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)

self._find_mod_seq_column(df)
df = df[~pd.isna(df["Retention time"])]
df.fillna("", inplace=True)

Expand Down
62 changes: 42 additions & 20 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,15 @@ class PSMReaderBase(ABC):
# the type of modification mapping to be used
_modification_type: Optional[str] = None

def __init__(
def __init__( # noqa: PLR0913 # too many arguments
self,
*,
column_mapping: Optional[dict] = None,
modification_mapping: Optional[dict] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: str = "minute",
mod_seq_columns: Optional[List[str]] = None,
**kwargs,
):
"""The Base class for all PSMReaders.
Expand All @@ -53,20 +54,20 @@ def __init__(
Parameters
----------
column_mapping : dict, optional
A dict that maps alphabase's columns to other search engine's.
A dict that maps alphabase's columns to those of other search engines.
If it is None, this dict will be read from psm_reader.yaml key `column_mapping`.
The key of the column_mapping is alphabase's column name, and
the value could be the column name or a list of column names
in other engine's result.
If it is None, this dict will be init by
`self._init_column_mapping`. The dict values could be
either str or list, for example:
in other engine's result, for example:
```
columns_mapping = {
'sequence': 'NakedSequence',
'charge': 'Charge',
'proteins':['Proteins','UniprotIDs'] # list, this reader will automatically detect all of them.
}
```
The first column name in the list that is present in the data will be mapped to the harmonized column name; the rest will be ignored.
Defaults to None.
modification_mapping : dict, optional
Expand Down Expand Up @@ -96,6 +97,12 @@ def __init__(
The unit of RT in the search engine result.
Defaults to 'minute'.
mod_seq_columns : list, optional
The candidate column names in which to look for modified sequences.
The first column name in the list that is present in the data will be used; the rest will be ignored.
By default read from psm_reader_yaml key "mod_seq_columns".
If it is not found there, an empty list is used.
**kwargs: dict
deprecated
Expand Down Expand Up @@ -139,7 +146,11 @@ def __init__(
self._engine_rt_unit = rt_unit
self._min_irt_value = -100
self._max_irt_value = 200
self._mod_seq_columns = []
self._mod_seq_columns = (
mod_seq_columns
if mod_seq_columns is not None
else psm_reader_yaml[self._reader_type].get("mod_seq_columns", [])
)

for key, value in kwargs.items(): # TODO: remove and remove kwargs
warnings.warn(
Expand Down Expand Up @@ -174,12 +185,13 @@ def set_modification_mapping(
"""
self._modification_mapper.set_modification_mapping(modification_mapping)

def _find_mod_seq_column(self, df: pd.DataFrame) -> None: # called in _load_file
def _get_mod_seq_column(self, df: pd.DataFrame) -> Optional[str]:
"""Get the first column from `_mod_seq_columns` that is a column of `df`."""
for mod_seq_col in self._mod_seq_columns:
if mod_seq_col in df.columns:
self.mod_seq_column = mod_seq_col
break
# TODO: warn if there's more
return mod_seq_col
return None
# TODO: warn if there's more

def _read_column_mapping(self) -> Dict[str, str]:
"""Read column mapping from psm_reader yaml file."""
Expand Down Expand Up @@ -218,6 +230,9 @@ def import_file(self, _file: str) -> pd.DataFrame:
"""
origin_df = self._load_file(_file)

self.mod_seq_column = self._get_mod_seq_column(origin_df)

self._psm_df = pd.DataFrame()

if len(origin_df):
Expand Down Expand Up @@ -304,17 +319,24 @@ def _load_file(self, filename: str) -> pd.DataFrame:
"""

def _find_mapped_columns(self, origin_df: pd.DataFrame) -> Dict[str, str]:
def _find_mapped_columns(self, df: pd.DataFrame) -> Dict[str, str]:
"""Determine the mapping of AlphaBase columns to the columns in the given DataFrame.
For each AlphaBase column name, check if the corresponding search engine-specific
name is in the DataFrame columns. If it is, add it to the mapping.
If the search engine-specific name is a list, use the first column name in the list that is present in the DataFrame.
"""
mapped_columns = {}
for col, map_col in self.column_mapping.items():
if isinstance(map_col, str):
if map_col in origin_df.columns:
mapped_columns[col] = map_col
elif isinstance(map_col, (list, tuple)):
for other_col in map_col:
if other_col in origin_df.columns:
mapped_columns[col] = other_col
for col_alphabase, col_other in self.column_mapping.items():
if isinstance(col_other, str):
if col_other in df.columns:
mapped_columns[col_alphabase] = col_other
elif isinstance(col_other, (list, tuple)):
for other_col in col_other:
if other_col in df.columns:
mapped_columns[col_alphabase] = other_col
break
# TODO: warn if there's more
return mapped_columns

def _translate_columns(self, origin_df: pd.DataFrame) -> None:
Expand Down
9 changes: 1 addition & 8 deletions alphabase/spectral_library/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from alphabase.peptide.mobility import mobility_to_ccs_for_df
from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_yaml
from alphabase.spectral_library.base import SpecLibBase


Expand Down Expand Up @@ -88,9 +87,6 @@ def __init__( # noqa: PLR0913 many arguments in function definition
deprecated
"""
if mod_seq_columns is None:
mod_seq_columns = psm_reader_yaml["library_reader_base"]["mod_seq_columns"]

SpecLibBase.__init__(
self,
charged_frag_types=charged_frag_types,
Expand Down Expand Up @@ -248,7 +244,7 @@ def _load_file(self, filename: str) -> pd.DataFrame:
"""Load the spectral library from a csv file."""
csv_sep = self._get_table_delimiter(filename)

df = pd.read_csv(
return pd.read_csv(
filename,
sep=csv_sep,
keep_default_na=False,
Expand All @@ -273,9 +269,6 @@ def _load_file(self, filename: str) -> pd.DataFrame:
"null",
],
)
self._find_mod_seq_column(df)

return df

def _post_process(
self,
Expand Down
Loading

0 comments on commit 0f963aa

Please sign in to comment.