Merge pull request #248 from MannLabs/refactor_readers_VI

Refactor readers vi
MannLabs · Jan 9, 2025 · 0f963aa · 0f963aa
2 parents d120c64 + 23030f0
commit 0f963aa
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 71 deletions.
diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml
@@ -25,6 +25,8 @@ maxquant:
   reader_type: maxquant
   rt_unit: minute
   fixed_C57: True
+  mod_seq_columns:
+    - 'Modified sequence'
   column_mapping:
     'sequence': 'Sequence'
     'charge': 'Charge'
@@ -171,6 +173,8 @@ diann:
     'scan_num': 'MS2.Scan'
     'score': 'CScore'
     'fdr': 'Q.Value'
+  mod_seq_columns:
+    - "Modified.Sequence"
   modification_mapping: 'maxquant'
 
 spectronaut_report:
@@ -185,19 +189,14 @@ spectronaut_report:
     'genes': 'PG.Genes'
     'uniprot_ids': 'PG.UniProtIds'
     'charge': 'charge'
+  mod_seq_columns:
+    - 'ModifiedSequence'
   modification_mapping: 'maxquant'
 
 spectronaut:
   reader_type: spectronaut
   rt_unit: irt
   fixed_C57: False
-  mod_seq_columns:
-    - 'ModifiedPeptide'
-    - 'ModifiedSequence'
-    - 'FullUniModPeptideName'
-    - 'ModifiedPeptideSequence'
-    - 'LabeledSequence'
-    - 'FullUniModPeptideName'
   column_mapping:
     'raw_name': 'ReferenceRun'
     'sequence': ['StrippedPeptide','PeptideSequence']
@@ -209,19 +208,19 @@ spectronaut:
     'proteins': ['Protein Name','ProteinId','ProteinID','ProteinName','ProteinGroup','ProteinGroups']
     'uniprot_ids': ['UniProtIds','UniProtID','UniprotId']
     'genes': ['Genes','Gene','GeneName','GeneNames']
-  modification_mapping: 'maxquant'
-
-library_reader_base:
-  reader_type: library_reader_base
-  rt_unit: irt
-  fixed_C57: False
   mod_seq_columns:
-    - 'ModifiedPeptideSequence'
     - 'ModifiedPeptide'
     - 'ModifiedSequence'
     - 'FullUniModPeptideName'
+    - 'ModifiedPeptideSequence'
     - 'LabeledSequence'
     - 'FullUniModPeptideName'
+  modification_mapping: 'maxquant'
+
+library_reader_base:
+  reader_type: library_reader_base
+  rt_unit: irt
+  fixed_C57: False
   column_mapping:
     'raw_name': 'ReferenceRun'
     'sequence': ['PeptideSequence', 'StrippedPeptide']
@@ -240,6 +239,13 @@ library_reader_base:
     'fragment_charge' : ['FragmentCharge', 'FragmentIonCharge', 'ProductCharge', 'ProductIonCharge']
     'fragment_series': ['FragmentSeriesNumber','FragmentNumber']
     'fragment_loss_type': ['FragmentLossType', 'FragmentIonLossType', 'ProductLossType', 'ProductIonLossType']
+  mod_seq_columns:
+    - 'ModifiedPeptideSequence'
+    - 'ModifiedPeptide'
+    - 'ModifiedSequence'
+    - 'FullUniModPeptideName'
+    - 'LabeledSequence'
+    - 'FullUniModPeptideName'
   modification_mapping: 'maxquant'
 
 sage:

diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py
@@ -7,7 +7,7 @@
 
 from alphabase.psm_reader.keys import PsmDfCols
 from alphabase.psm_reader.maxquant_reader import MaxQuantReader
-from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml
+from alphabase.psm_reader.psm_reader import psm_reader_provider
 
 
 class SpectronautReader(MaxQuantReader):
@@ -33,9 +33,6 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
         **kwargs,
     ):
         """Initialize SpectronautReader."""
-        if mod_seq_columns is None:
-            mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]
-
         super().__init__(
             column_mapping=column_mapping,
             modification_mapping=modification_mapping,
@@ -47,14 +44,15 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
             **kwargs,
         )
 
-        self.mod_seq_column = "ModifiedPeptide"
         self._min_max_rt_norm = True
 
     def _load_file(self, filename: str) -> pd.DataFrame:
         csv_sep = self._get_table_delimiter(filename)
         df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)
 
-        self._find_mod_seq_column(df)
+        self.mod_seq_column = self._get_mod_seq_column(
+            df
+        )  # TODO: this needs to be removed
         if "ReferenceRun" in df.columns:
             df.drop_duplicates(
                 ["ReferenceRun", self.mod_seq_column, "PrecursorCharge"], inplace=True
@@ -84,9 +82,6 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
         **kwargs,
     ):
         """SWATH or OpenSWATH library, similar to `SpectronautReader`."""
-        if mod_seq_columns is None:
-            mod_seq_columns = psm_reader_yaml["spectronaut"]["mod_seq_columns"]
-
         super().__init__(
             column_mapping=column_mapping,
             modification_mapping=modification_mapping,
@@ -126,7 +121,6 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
             **kwargs,
         )
 
-        self.mod_seq_column = "Modified.Sequence"
         self._min_max_rt_norm = False
 
     def _load_file(self, filename: str) -> pd.DataFrame:
@@ -172,15 +166,16 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
             **kwargs,
         )
 
-        self.precursor_column = "EG.PrecursorId"
-        self.mod_seq_column = "ModifiedSequence"
-
+        self.precursor_column = "EG.PrecursorId"  # TODO: move to yaml
         self._min_max_rt_norm = False
 
     def _load_file(self, filename: str) -> pd.DataFrame:
         csv_sep = self._get_table_delimiter(filename)
         df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)
 
+        self.mod_seq_column = self._get_mod_seq_column(
+            df
+        )  # TODO: this needs to be removed
         df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
             self.precursor_column
         ].str.split(".", expand=True, n=2)

diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py
@@ -177,23 +177,17 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
             deprecated
 
         """
-        if mod_seq_columns is None:
-            mod_seq_columns = [
-                "Modified sequence"
-            ]  # TODO: why not take from psm_reader.yaml?
-
         super().__init__(
             column_mapping=column_mapping,
             modification_mapping=modification_mapping,
             fdr=fdr,
             keep_decoy=keep_decoy,
             rt_unit=rt_unit,
+            mod_seq_columns=mod_seq_columns,
             **kwargs,
         )
 
         self.fixed_C57 = fixed_C57
-        self._mod_seq_columns = mod_seq_columns
-        self.mod_seq_column = "Modified sequence"
 
     def _translate_decoy(self) -> None:
         if PsmDfCols.DECOY in self._psm_df.columns:
@@ -205,7 +199,6 @@ def _load_file(self, filename: str) -> pd.DataFrame:
         csv_sep = self._get_table_delimiter(filename)
         df = pd.read_csv(filename, sep=csv_sep, keep_default_na=False)
 
-        self._find_mod_seq_column(df)
         df = df[~pd.isna(df["Retention time"])]
         df.fillna("", inplace=True)
 

diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py
@@ -35,14 +35,15 @@ class PSMReaderBase(ABC):
     # the type of modification mapping to be used
     _modification_type: Optional[str] = None
 
-    def __init__(
+    def __init__(  # noqa: PLR0913 # too many arguments
         self,
         *,
         column_mapping: Optional[dict] = None,
         modification_mapping: Optional[dict] = None,
         fdr: float = 0.01,
         keep_decoy: bool = False,
         rt_unit: str = "minute",
+        mod_seq_columns: Optional[List[str]] = None,
         **kwargs,
     ):
         """The Base class for all PSMReaders.
@@ -53,20 +54,20 @@ def __init__(
         Parameters
         ----------
         column_mapping : dict, optional
-            A dict that maps alphabase's columns to other search engine's.
+            A dict that maps alphabase's columns to those of other search engines.
+            If it is None, this dict will be read from psm_reader.yaml key `column_mapping`.
+
             The key of the column_mapping is alphabase's column name, and
             the value could be the column name or a list of column names
-            in other engine's result.
-            If it is None, this dict will be init by
-            `self._init_column_mapping`. The dict values could be
-            either str or list, for example:
+            in other engine's result, for example:
             ```
             columns_mapping = {
                 'sequence': 'NakedSequence',
                 'charge': 'Charge',
                 'proteins':['Proteins','UniprotIDs'] # list, this reader will automatically detect all of them.
             }
             ```
+            The first column name in the list that is present in the data will be mapped to the harmonized column name; the rest will be ignored.
             Defaults to None.
 
         modification_mapping : dict, optional
@@ -96,6 +97,12 @@ def __init__(
             The unit of RT in the search engine result.
             Defaults to 'minute'.
 
+        mod_seq_columns : list, optional
+            The columns to find modified sequences.
+            The first column name in the list that is present in the data will be used; the rest will be ignored.
+            By default read from psm_reader_yaml key "mod_seq_columns".
+            If it is not found there, an empty list is used.
+
         **kwargs: dict
             deprecated
 
@@ -139,7 +146,11 @@ def __init__(
         self._engine_rt_unit = rt_unit
         self._min_irt_value = -100
         self._max_irt_value = 200
-        self._mod_seq_columns = []
+        self._mod_seq_columns = (
+            mod_seq_columns
+            if mod_seq_columns is not None
+            else psm_reader_yaml[self._reader_type].get("mod_seq_columns", [])
+        )
 
         for key, value in kwargs.items():  # TODO: remove and remove kwargs
             warnings.warn(
@@ -174,12 +185,13 @@ def set_modification_mapping(
         """
         self._modification_mapper.set_modification_mapping(modification_mapping)
 
-    def _find_mod_seq_column(self, df: pd.DataFrame) -> None:  # called in _load_file
+    def _get_mod_seq_column(self, df: pd.DataFrame) -> Optional[str]:
+        """Get the first column from `_mod_seq_columns` that is a column of `df`."""
         for mod_seq_col in self._mod_seq_columns:
             if mod_seq_col in df.columns:
-                self.mod_seq_column = mod_seq_col
-                break
-            # TODO: warn if there's more
+                return mod_seq_col
+        return None
+        # TODO: warn if there's more
 
     def _read_column_mapping(self) -> Dict[str, str]:
         """Read column mapping from psm_reader yaml file."""
@@ -218,6 +230,9 @@ def import_file(self, _file: str) -> pd.DataFrame:
 
         """
         origin_df = self._load_file(_file)
+
+        self.mod_seq_column = self._get_mod_seq_column(origin_df)
+
         self._psm_df = pd.DataFrame()
 
         if len(origin_df):
@@ -304,17 +319,24 @@ def _load_file(self, filename: str) -> pd.DataFrame:
 
         """
 
-    def _find_mapped_columns(self, origin_df: pd.DataFrame) -> Dict[str, str]:
+    def _find_mapped_columns(self, df: pd.DataFrame) -> Dict[str, str]:
+        """Determine the mapping of AlphaBase columns to the columns in the given DataFrame.
+
+        For each AlphaBase column name, check if the corresponding search engine-specific
+        name is in the DataFrame columns. If it is, add it to the mapping.
+        If the search engine-specific name is a list, use the first column name in the list that is present in the DataFrame.
+        """
         mapped_columns = {}
-        for col, map_col in self.column_mapping.items():
-            if isinstance(map_col, str):
-                if map_col in origin_df.columns:
-                    mapped_columns[col] = map_col
-            elif isinstance(map_col, (list, tuple)):
-                for other_col in map_col:
-                    if other_col in origin_df.columns:
-                        mapped_columns[col] = other_col
+        for col_alphabase, col_other in self.column_mapping.items():
+            if isinstance(col_other, str):
+                if col_other in df.columns:
+                    mapped_columns[col_alphabase] = col_other
+            elif isinstance(col_other, (list, tuple)):
+                for other_col in col_other:
+                    if other_col in df.columns:
+                        mapped_columns[col_alphabase] = other_col
                         break
+                        # TODO: warn if there's more
         return mapped_columns
 
     def _translate_columns(self, origin_df: pd.DataFrame) -> None:

diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py
@@ -10,7 +10,6 @@
 from alphabase.peptide.mobility import mobility_to_ccs_for_df
 from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols
 from alphabase.psm_reader.maxquant_reader import MaxQuantReader
-from alphabase.psm_reader.psm_reader import psm_reader_yaml
 from alphabase.spectral_library.base import SpecLibBase
 
 
@@ -88,9 +87,6 @@ def __init__(  # noqa: PLR0913 many arguments in function definition
             deprecated
 
         """
-        if mod_seq_columns is None:
-            mod_seq_columns = psm_reader_yaml["library_reader_base"]["mod_seq_columns"]
-
         SpecLibBase.__init__(
             self,
             charged_frag_types=charged_frag_types,
@@ -248,7 +244,7 @@ def _load_file(self, filename: str) -> pd.DataFrame:
         """Load the spectral library from a csv file."""
         csv_sep = self._get_table_delimiter(filename)
 
-        df = pd.read_csv(
+        return pd.read_csv(
             filename,
             sep=csv_sep,
             keep_default_na=False,
@@ -273,9 +269,6 @@ def _load_file(self, filename: str) -> pd.DataFrame:
                 "null",
             ],
         )
-        self._find_mod_seq_column(df)
-
-        return df
 
     def _post_process(
         self,