MannLabs · jalew188 · Mar 7, 2022 · Feb 24, 2022 · Feb 28, 2022 · Mar 4, 2022
diff --git a/alphabase/_nbdev.py b/alphabase/_nbdev.py
@@ -51,6 +51,7 @@
          "get_x_tandem_score": "alphapept_reader.ipynb",
          "AlphaPeptReader": "alphapept_reader.ipynb",
          "SpectronautReader": "dia_search_reader.ipynb",
+         "SwathReader": "dia_search_reader.ipynb",
          "DiannReader": "dia_search_reader.ipynb",
          "parse_mod_seq": "maxquant_reader.ipynb",
          "MaxQuantReader": "maxquant_reader.ipynb",

diff --git a/alphabase/io/psm_reader/dia_search_reader.py b/alphabase/io/psm_reader/dia_search_reader.py
@@ -1,6 +1,6 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbdev_nbs/io/psm_reader/dia_search_reader.ipynb (unless otherwise specified).
 
-__all__ = ['SpectronautReader', 'DiannReader']
+__all__ = ['SpectronautReader', 'SwathReader', 'DiannReader']
 
 # Cell
 import pandas as pd
@@ -67,6 +67,36 @@ def _load_file(self, filename):
 
         return df
 
+class SwathReader(SpectronautReader):
+    """PSM reader registered below under the names 'openswath' and 'swath'.
+
+    A thin subclass of `SpectronautReader`: it defines no methods besides
+    the constructor, which supplies its own default values and forwards
+    them to the parent constructor.
+    """
+    def __init__(self,
+        *,
+        column_mapping:dict = None,
+        modification_mapping:dict = None,
+        fdr = 0.01,
+        keep_decoy = False,
+        mod_sep = '()',
+        underscore_for_ncterm=False,
+        fixed_C57 = False,
+        # NOTE(review): mutable default list, shared by every call that
+        # omits this argument; safe only if it is never mutated downstream
+        # -- confirm against the parent class.
+        mod_seq_columns=[
+            'ModifiedPeptide',
+            'ModifiedSequence',
+            'FullUniModPeptideName',
+        ],
+        csv_sep = '\t',
+        # NOTE(review): accepted but not forwarded to the parent constructor,
+        # so extra keyword arguments are silently ignored -- confirm that
+        # this is intended.
+        **kwargs,
+    ):
+        # Every named argument is passed through unchanged.
+        super().__init__(
+            column_mapping=column_mapping,
+            modification_mapping=modification_mapping,
+            fdr=fdr, keep_decoy=keep_decoy,
+            mod_sep=mod_sep,
+            underscore_for_ncterm=underscore_for_ncterm,
+            fixed_C57=fixed_C57,
+            mod_seq_columns=mod_seq_columns,
+            csv_sep=csv_sep,
+        )
+
+
 class DiannReader(SpectronautReader):
     def __init__(self,
         *,
@@ -106,7 +136,10 @@ def _load_file(self, filename):
     'spectronaut', SpectronautReader
 )
 psm_reader_provider.register_reader(
-    'openswath', SpectronautReader
+    'openswath', SwathReader
+)
+psm_reader_provider.register_reader(
+    'swath', SwathReader
 )
 psm_reader_provider.register_reader(
     'diann', DiannReader

diff --git a/alphabase/io/psm_reader/psm_reader.py b/alphabase/io/psm_reader/psm_reader.py
@@ -85,6 +85,7 @@ def __init__(self,
     ):
         """The Base class for all PSMReaders. The key of the sub-classes for different
         search engine format is to re-define `column_mapping` and `modification_mapping`.
+
         Args:
             column_mapping (dict, optional):
                 A dict that maps alphabase's columns to other search engine's.
@@ -114,6 +115,7 @@ def __init__(self,
                 Defaults to 0.01.
             keep_decoy(bool, optional): If keep decoy PSMs in self.psm_df.
                 Defautls to False.
+
         Attributes:
             column_mapping (dict): dict structure same as column_mapping in Args.
             modification_mapping (dict): dict structure same as modification_mapping in Args.
@@ -295,6 +297,9 @@ def _translate_columns(self, origin_df:pd.DataFrame):
                         self._psm_df[col] = origin_df[other_col]
                         break
 
+        if 'scan_num' in self._psm_df.columns:
+            self._psm_df['spec_idx'] = self._psm_df.scan_num - 1
+
 
     def _load_modifications(self, origin_df:pd.DataFrame):
         """Read modification information from 'origin_df'.

diff --git a/alphabase/io/tempmmap.py b/alphabase/io/tempmmap.py
@@ -0,0 +1,127 @@
+#!python
+"""This module allows to create temporary mmapped arrays."""
+
+# builtin
+import os
+import logging
+import atexit
+
+# external
+import numpy as np
+import mmap
+import h5py
+import tempfile
+
+
+# Module-wide temporary directory; array() creates its backing files in it.
+_TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
+# Path of the directory above. clear() rebinds both names to a new folder.
+TEMP_DIR_NAME = _TEMP_DIR.name
+
+# Emitted once, when the module is imported, so that users know where the
+# backing files are written.
+logging.warning(
+    f"Temp mmap arrays are written to {TEMP_DIR_NAME}. "
+    "Cleanup of this folder is OS dependant, "
+    "and might need to be triggered manually!"
+)
+
+
+def array(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    """Create a writable temporary mmapped array.
+
+    Parameters
+    ----------
+    shape : tuple
+        A tuple with the shape of the array.
+    dtype : type
+        The np.dtype of the array.
+
+    Returns
+    -------
+    type
+        A writable temporary mmapped array.
+    """
+    temp_file_name = os.path.join(
+        TEMP_DIR_NAME,
+        f"temp_mmap_{np.random.randint(2**63)}.hdf"
+    )
+    with h5py.File(temp_file_name, "w") as hdf_file:
+        array = hdf_file.create_dataset(
+            "array",
+            shape=shape,
+            dtype=dtype
+        )
+        array[0] = 0
+        offset = array.id.get_offset()
+    with open(temp_file_name, "rb+") as raw_hdf_file:
+        mmap_obj = mmap.mmap(
+            raw_hdf_file.fileno(),
+            0,
+            access=mmap.ACCESS_WRITE
+        )
+        return np.frombuffer(
+            mmap_obj,
+            dtype=dtype,
+            count=np.prod(shape),
+            offset=offset
+        ).reshape(shape)
+
+
+def zeros(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    """Create a writable temporary mmapped array filled with zeros.
+
+    Parameters
+    ----------
+    shape : tuple
+        A tuple with the shape of the array.
+    dtype : type
+        The np.dtype of the array.
+
+    Returns
+    -------
+    type
+        A writable temporary mmapped array filled with zeros.
+    """
+    _array = array(shape, dtype)
+    _array[:] = 0
+    return _array
+
+
+def ones(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    """Create a writable temporary mmapped array filled with ones.
+
+    Parameters
+    ----------
+    shape : tuple
+        A tuple with the shape of the array.
+    dtype : type
+        The np.dtype of the array.
+
+    Returns
+    -------
+    type
+        A writable temporary mmapped array filled with ones.
+    """
+    _array = array(shape, dtype)
+    _array[:] = 1
+    return _array
+
+
+@atexit.register
+def clear() -> str:
+    """Reset the temporary folder containing temp mmapped arrays.
+
+    WARNING: All existing temp mmapp arrays will be unusable!
+
+    Returns
+    -------
+    str
+        The name of the new temporary folder.
+    """
+    global _TEMP_DIR
+    global TEMP_DIR_NAME
+    logging.warning(
+        f"Folder {TEMP_DIR_NAME} with temp mmap arrays is being deleted. "
+        "All existing temp mmapp arrays will be unusable!"
+    )
+    del _TEMP_DIR
+    _TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
+    TEMP_DIR_NAME = _TEMP_DIR.name
+    return TEMP_DIR_NAME
diff --git a/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb b/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb
@@ -87,6 +87,36 @@
     "        \n",
     "        return df\n",
     "\n",
+    "class SwathReader(SpectronautReader):\n",
+    "    def __init__(self,\n",
+    "        *,\n",
+    "        column_mapping:dict = None,\n",
+    "        modification_mapping:dict = None,\n",
+    "        fdr = 0.01,\n",
+    "        keep_decoy = False,\n",
+    "        mod_sep = '()',\n",
+    "        underscore_for_ncterm=False,\n",
+    "        fixed_C57 = False,\n",
+    "        mod_seq_columns=[\n",
+    "            'ModifiedPeptide',\n",
+    "            'ModifiedSequence',\n",
+    "            'FullUniModPeptideName',\n",
+    "        ],\n",
+    "        csv_sep = '\\t',\n",
+    "        **kwargs,\n",
+    "    ):\n",
+    "        super().__init__(\n",
+    "            column_mapping=column_mapping,\n",
+    "            modification_mapping=modification_mapping,\n",
+    "            fdr=fdr, keep_decoy=keep_decoy,\n",
+    "            mod_sep=mod_sep,\n",
+    "            underscore_for_ncterm=underscore_for_ncterm,\n",
+    "            fixed_C57=fixed_C57,\n",
+    "            mod_seq_columns=mod_seq_columns,\n",
+    "            csv_sep=csv_sep,\n",
+    "        )\n",
+    "\n",
+    "\n",
     "class DiannReader(SpectronautReader):\n",
     "    def __init__(self,\n",
     "        *,\n",
@@ -126,7 +156,10 @@
     "    'spectronaut', SpectronautReader\n",
     ")\n",
     "psm_reader_provider.register_reader(\n",
-    "    'openswath', SpectronautReader\n",
+    "    'openswath', SwathReader\n",
+    ")\n",
+    "psm_reader_provider.register_reader(\n",
+    "    'swath', SwathReader\n",
     ")\n",
     "psm_reader_provider.register_reader(\n",
     "    'diann', DiannReader\n",
@@ -291,93 +324,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>charge</th>\n",
-       "      <th>rt</th>\n",
-       "      <th>precursor_mz</th>\n",
-       "      <th>mods</th>\n",
-       "      <th>mod_sites</th>\n",
-       "      <th>nAA</th>\n",
-       "      <th>rt_norm</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>AAAAAAAAAASGAAIPPLIPPRR</td>\n",
-       "      <td>3</td>\n",
-       "      <td>-10.0</td>\n",
-       "      <td>685.732240</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>23</td>\n",
-       "      <td>0.000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>AAAAAAAAAASGAAIPPLIPPRR</td>\n",
-       "      <td>4</td>\n",
-       "      <td>59.2</td>\n",
-       "      <td>514.550999</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>23</td>\n",
-       "      <td>0.618962</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR</td>\n",
-       "      <td>5</td>\n",
-       "      <td>101.8</td>\n",
-       "      <td>728.201724</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>36</td>\n",
-       "      <td>1.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                               sequence  charge     rt  precursor_mz mods  \\\n",
-       "0               AAAAAAAAAASGAAIPPLIPPRR       3  -10.0    685.732240        \n",
-       "1               AAAAAAAAAASGAAIPPLIPPRR       4   59.2    514.550999        \n",
-       "2  AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR       5  101.8    728.201724        \n",
-       "\n",
-       "  mod_sites  nAA   rt_norm  \n",
-       "0             23  0.000000  \n",
-       "1             23  0.618962  \n",
-       "2             36  1.000000  "
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from io import StringIO\n",
     "tsv = StringIO('''PrecursorMz\tProductMz\tTr_recalibrated\ttransition_name\tCE\tLibraryIntensity\ttransition_group_id\tdecoy\tPeptideSequence\tProteinName\tAnnotation\tFullUniModPeptideName\tPrecursorCharge\tGroupLabel\tUniprotID\tFragmentType\tFragmentCharge\tFragmentSeriesNumber\n",
@@ -396,8 +343,9 @@
     "\n",
     "\n",
     "osw_reader = psm_reader_provider.get_reader('openswath')\n",
-    "osw_reader.import_file(tsv)\n",
-    "osw_reader.psm_df"
+    "psm_df = osw_reader.import_file(tsv)\n",
+    "assert psm_df.loc[2,'mod_sites'] == '30'\n",
+    "assert psm_df.loc[2,'mods'] == 'Carbamidomethyl@C'"
    ]
   },
   {
@@ -790,8 +738,7 @@
     "F:\\XXX\\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642.d\t20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642\tP28482\tP28482\t\tMAPK1\t69911.3\t68996.2\t63388.2\t69911.3\t68996.2\t63388.2\t63388.2\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR\tAAAAAAGAGPEMVR\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR2\t2\t0.00122498\t0.000834654\t0.000152765\t0.000152765\t0.000146135\t0.000154631\t0\t1\t1572.67\t1552.08\t1572.67\t0.906427\t7.45711\t7.40943\t7.50482\t15.9025\t7.43922\t16.0749\t0\t0.371998\t5937.05\t0.30888\t0.510876\t0.72688\t0.95182\t1.96259\t0.65474\t1320.01;838.009;638.006;827.009;562.005;339.003;\t1320.01;252.656;0;213.073;330.325;0;\t0.976001;0.542934;0.346963;0.38014;0.442774;-0.259898;\t11239\t1.01773\t1.0262\t1.02509\t1.01834\n",
     "''')\n",
     "diann_reader = psm_reader_provider.get_reader('diann')\n",
-    "diann_reader.import_file(tsv)\n",
-    "diann_reader.psm_df"
+    "diann_reader.import_file(tsv)"
    ]
   },
   {
@@ -866,6 +813,13 @@
     "    'S[UniMod:21]'])\n",
     ")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {