diff --git a/alphabase/_nbdev.py b/alphabase/_nbdev.py index 69980026..cd0b81d4 100644 --- a/alphabase/_nbdev.py +++ b/alphabase/_nbdev.py @@ -51,6 +51,7 @@ "get_x_tandem_score": "alphapept_reader.ipynb", "AlphaPeptReader": "alphapept_reader.ipynb", "SpectronautReader": "dia_search_reader.ipynb", + "SwathReader": "dia_search_reader.ipynb", "DiannReader": "dia_search_reader.ipynb", "parse_mod_seq": "maxquant_reader.ipynb", "MaxQuantReader": "maxquant_reader.ipynb", diff --git a/alphabase/io/psm_reader/dia_search_reader.py b/alphabase/io/psm_reader/dia_search_reader.py index 8c6f8ed2..97456ff1 100644 --- a/alphabase/io/psm_reader/dia_search_reader.py +++ b/alphabase/io/psm_reader/dia_search_reader.py @@ -1,6 +1,6 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: nbdev_nbs/io/psm_reader/dia_search_reader.ipynb (unless otherwise specified). -__all__ = ['SpectronautReader', 'DiannReader'] +__all__ = ['SpectronautReader', 'SwathReader', 'DiannReader'] # Cell import pandas as pd @@ -67,6 +67,36 @@ def _load_file(self, filename): return df +class SwathReader(SpectronautReader): + def __init__(self, + *, + column_mapping:dict = None, + modification_mapping:dict = None, + fdr = 0.01, + keep_decoy = False, + mod_sep = '()', + underscore_for_ncterm=False, + fixed_C57 = False, + mod_seq_columns=[ + 'ModifiedPeptide', + 'ModifiedSequence', + 'FullUniModPeptideName', + ], + csv_sep = '\t', + **kwargs, + ): + super().__init__( + column_mapping=column_mapping, + modification_mapping=modification_mapping, + fdr=fdr, keep_decoy=keep_decoy, + mod_sep=mod_sep, + underscore_for_ncterm=underscore_for_ncterm, + fixed_C57=fixed_C57, + mod_seq_columns=mod_seq_columns, + csv_sep=csv_sep, + ) + + class DiannReader(SpectronautReader): def __init__(self, *, @@ -106,7 +136,10 @@ def _load_file(self, filename): 'spectronaut', SpectronautReader ) psm_reader_provider.register_reader( - 'openswath', SpectronautReader + 'openswath', SwathReader +) +psm_reader_provider.register_reader( + 'swath', SwathReader ) psm_reader_provider.register_reader( 'diann', DiannReader diff --git a/alphabase/io/psm_reader/psm_reader.py b/alphabase/io/psm_reader/psm_reader.py index 5f1439c7..d5a6b54d 100644 --- a/alphabase/io/psm_reader/psm_reader.py +++ b/alphabase/io/psm_reader/psm_reader.py @@ -85,6 +85,7 @@ def __init__(self, ): """The Base class for all PSMReaders. The key of the sub-classes for different search engine format is to re-define `column_mapping` and `modification_mapping`. + Args: column_mapping (dict, optional): A dict that maps alphabase's columns to other search engine's. @@ -114,6 +115,7 @@ def __init__(self, Defaults to 0.01. keep_decoy(bool, optional): If keep decoy PSMs in self.psm_df. Defautls to False. + Attributes: column_mapping (dict): dict structure same as column_mapping in Args. modification_mapping (dict): dict structure same as modification_mapping in Args. @@ -295,6 +297,9 @@ def _translate_columns(self, origin_df:pd.DataFrame): self._psm_df[col] = origin_df[other_col] break + if 'scan_num' in self._psm_df.columns: + self._psm_df['spec_idx'] = self._psm_df.scan_num - 1 + def _load_modifications(self, origin_df:pd.DataFrame): """Read modification information from 'origin_df'. diff --git a/alphabase/io/tempmmap.py b/alphabase/io/tempmmap.py new file mode 100644 index 00000000..b20adf2d --- /dev/null +++ b/alphabase/io/tempmmap.py @@ -0,0 +1,127 @@ +#!python +"""This module allows to create temporary mmapped arrays.""" + +# builtin +import os +import logging +import atexit + +# external +import numpy as np +import mmap +import h5py +import tempfile + + +_TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_") +TEMP_DIR_NAME = _TEMP_DIR.name + +logging.warning( + f"Temp mmap arrays are written to {TEMP_DIR_NAME}. " + "Cleanup of this folder is OS dependant, " + "and might need to be triggered manually!" +) + + +def array(shape: tuple, dtype: np.dtype) -> np.ndarray: + """Create a writable temporary mmapped array. + + Parameters + ---------- + shape : tuple + A tuple with the shape of the array. + dtype : type + The np.dtype of the array. + + Returns + ------- + type + A writable temporary mmapped array. + """ + temp_file_name = os.path.join( + TEMP_DIR_NAME, + f"temp_mmap_{np.random.randint(2**63)}.hdf" + ) + with h5py.File(temp_file_name, "w") as hdf_file: + array = hdf_file.create_dataset( + "array", + shape=shape, + dtype=dtype + ) + array[0] = 0 + offset = array.id.get_offset() + with open(temp_file_name, "rb+") as raw_hdf_file: + mmap_obj = mmap.mmap( + raw_hdf_file.fileno(), + 0, + access=mmap.ACCESS_WRITE + ) + return np.frombuffer( + mmap_obj, + dtype=dtype, + count=np.prod(shape), + offset=offset + ).reshape(shape) + + +def zeros(shape: tuple, dtype: np.dtype) -> np.ndarray: + """Create a writable temporary mmapped array filled with zeros. + + Parameters + ---------- + shape : tuple + A tuple with the shape of the array. + dtype : type + The np.dtype of the array. + + Returns + ------- + type + A writable temporary mmapped array filled with zeros. + """ + _array = array(shape, dtype) + _array[:] = 0 + return _array + + +def ones(shape: tuple, dtype: np.dtype) -> np.ndarray: + """Create a writable temporary mmapped array filled with ones. + + Parameters + ---------- + shape : tuple + A tuple with the shape of the array. + dtype : type + The np.dtype of the array. + + Returns + ------- + type + A writable temporary mmapped array filled with ones. + """ + _array = array(shape, dtype) + _array[:] = 1 + return _array + + +@atexit.register +def clear() -> str: + """Reset the temporary folder containing temp mmapped arrays. + + WARNING: All existing temp mmapp arrays will be unusable! + + Returns + ------- + str + The name of the new temporary folder. + """ + global _TEMP_DIR + global TEMP_DIR_NAME + logging.warning( + f"Folder {TEMP_DIR_NAME} with temp mmap arrays is being deleted. " + "All existing temp mmapp arrays will be unusable!" + ) + del _TEMP_DIR + _TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_") + TEMP_DIR_NAME = _TEMP_DIR.name + return TEMP_DIR_NAME diff --git a/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb b/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb index 76bbcf55..cca588ee 100644 --- a/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb +++ b/nbdev_nbs/io/psm_reader/dia_search_reader.ipynb @@ -87,6 +87,36 @@ " \n", " return df\n", "\n", + "class SwathReader(SpectronautReader):\n", + " def __init__(self,\n", + " *,\n", + " column_mapping:dict = None,\n", + " modification_mapping:dict = None,\n", + " fdr = 0.01,\n", + " keep_decoy = False,\n", + " mod_sep = '()',\n", + " underscore_for_ncterm=False,\n", + " fixed_C57 = False,\n", + " mod_seq_columns=[\n", + " 'ModifiedPeptide',\n", + " 'ModifiedSequence',\n", + " 'FullUniModPeptideName',\n", + " ],\n", + " csv_sep = '\\t',\n", + " **kwargs,\n", + " ):\n", + " super().__init__(\n", + " column_mapping=column_mapping,\n", + " modification_mapping=modification_mapping,\n", + " fdr=fdr, keep_decoy=keep_decoy,\n", + " mod_sep=mod_sep,\n", + " underscore_for_ncterm=underscore_for_ncterm,\n", + " fixed_C57=fixed_C57,\n", + " mod_seq_columns=mod_seq_columns,\n", + " csv_sep=csv_sep,\n", + " )\n", + "\n", + "\n", "class DiannReader(SpectronautReader):\n", " def __init__(self,\n", " *,\n", @@ -126,7 +156,10 @@ " 'spectronaut', SpectronautReader\n", ")\n", "psm_reader_provider.register_reader(\n", - " 'openswath', SpectronautReader\n", + " 'openswath', SwathReader\n", + ")\n", + "psm_reader_provider.register_reader(\n", + " 'swath', SwathReader\n", ")\n", "psm_reader_provider.register_reader(\n", " 'diann', DiannReader\n", @@ -291,93 +324,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sequencechargertprecursor_mzmodsmod_sitesnAArt_norm
0AAAAAAAAAASGAAIPPLIPPRR3-10.0685.732240230.000000
1AAAAAAAAAASGAAIPPLIPPRR459.2514.550999230.618962
2AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR5101.8728.201724361.000000
\n", - "
" - ], - "text/plain": [ - " sequence charge rt precursor_mz mods \\\n", - "0 AAAAAAAAAASGAAIPPLIPPRR 3 -10.0 685.732240 \n", - "1 AAAAAAAAAASGAAIPPLIPPRR 4 59.2 514.550999 \n", - "2 AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 5 101.8 728.201724 \n", - "\n", - " mod_sites nAA rt_norm \n", - "0 23 0.000000 \n", - "1 23 0.618962 \n", - "2 36 1.000000 " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from io import StringIO\n", "tsv = StringIO('''PrecursorMz\tProductMz\tTr_recalibrated\ttransition_name\tCE\tLibraryIntensity\ttransition_group_id\tdecoy\tPeptideSequence\tProteinName\tAnnotation\tFullUniModPeptideName\tPrecursorCharge\tGroupLabel\tUniprotID\tFragmentType\tFragmentCharge\tFragmentSeriesNumber\n", @@ -396,8 +343,9 @@ "\n", "\n", "osw_reader = psm_reader_provider.get_reader('openswath')\n", - "osw_reader.import_file(tsv)\n", - "osw_reader.psm_df" + "psm_df = osw_reader.import_file(tsv)\n", + "assert psm_df.loc[2,'mod_sites'] == '30'\n", + "assert psm_df.loc[2,'mods'] == 'Carbamidomethyl@C'" ] }, { @@ -790,8 +738,7 @@ "F:\\XXX\\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642.d\t20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642\tP28482\tP28482\t\tMAPK1\t69911.3\t68996.2\t63388.2\t69911.3\t68996.2\t63388.2\t63388.2\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR\tAAAAAAGAGPEMVR\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR2\t2\t0.00122498\t0.000834654\t0.000152765\t0.000152765\t0.000146135\t0.000154631\t0\t1\t1572.67\t1552.08\t1572.67\t0.906427\t7.45711\t7.40943\t7.50482\t15.9025\t7.43922\t16.0749\t0\t0.371998\t5937.05\t0.30888\t0.510876\t0.72688\t0.95182\t1.96259\t0.65474\t1320.01;838.009;638.006;827.009;562.005;339.003;\t1320.01;252.656;0;213.073;330.325;0;\t0.976001;0.542934;0.346963;0.38014;0.442774;-0.259898;\t11239\t1.01773\t1.0262\t1.02509\t1.01834\n", "''')\n", "diann_reader = psm_reader_provider.get_reader('diann')\n", - "diann_reader.import_file(tsv)\n", - "diann_reader.psm_df" + "diann_reader.import_file(tsv)" ] }, { @@ -866,6 +813,13 @@ " 'S[UniMod:21]'])\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/nbdev_nbs/io/psm_reader/psm_reader.ipynb b/nbdev_nbs/io/psm_reader/psm_reader.ipynb index 1299c47d..3a6af8c5 100644 --- a/nbdev_nbs/io/psm_reader/psm_reader.ipynb +++ b/nbdev_nbs/io/psm_reader/psm_reader.ipynb @@ -173,7 +173,7 @@ "15. `score` (float): PSM score. The larger the better PSMs, meaning that `E-value` or `P-value` scores must be `-log()`.\n", "16. `fdr` (float): FDR or q-value.\n", "17. `raw_name` (str): Raw file name.\n", - "18. `spec_idx` (int): scan number in Thermo RAW data, or spectrum index for other RAW data. We can use it to locate the MS2 spectrum for identification.\n", + "18. `spec_idx` (int): spectrum index starting from 0 in RAW data. For thermo RAW, it is also Scan number - 1. We can use it to locate the MS2 spectrum for identification.\n", "19. `query_id` (int or str): the unique id for not only inlucdes unique spectrum (`spec_idx`), but also the precursor or MS1 isotope index. It could be `query_idx` in alphapept.\n", "20. `decoy`: 0 if the peptide is target match, otherwise 1." ] @@ -197,6 +197,7 @@ " ):\n", " \"\"\"The Base class for all PSMReaders. The key of the sub-classes for different \n", " search engine format is to re-define `column_mapping` and `modification_mapping`.\n", + " \n", " Args:\n", " column_mapping (dict, optional): \n", " A dict that maps alphabase's columns to other search engine's.\n", @@ -226,6 +227,7 @@ " Defaults to 0.01.\n", " keep_decoy(bool, optional): If keep decoy PSMs in self.psm_df.\n", " Defautls to False.\n", + " \n", " Attributes:\n", " column_mapping (dict): dict structure same as column_mapping in Args.\n", " modification_mapping (dict): dict structure same as modification_mapping in Args.\n", @@ -406,6 +408,9 @@ " if other_col in origin_df.columns:\n", " self._psm_df[col] = origin_df[other_col]\n", " break\n", + " \n", + " if 'scan_num' in self._psm_df.columns:\n", + " self._psm_df['spec_idx'] = self._psm_df.scan_num - 1\n", " \n", "\n", " def _load_modifications(self, origin_df:pd.DataFrame):\n",