class SwathReader(SpectronautReader):
    """PSM reader registered for the 'openswath' and 'swath' formats.

    This is a thin subclass of `SpectronautReader`: it only supplies its
    own default values and forwards every named option unchanged to
    `SpectronautReader.__init__`.
    """
    def __init__(self,
        *,
        column_mapping:dict = None,
        modification_mapping:dict = None,
        fdr = 0.01,
        keep_decoy = False,
        mod_sep = '()',
        underscore_for_ncterm=False,
        fixed_C57 = False,
        mod_seq_columns=None,
        csv_sep = '\t',
        **kwargs,
    ):
        """Create the reader and delegate all options to the parent class.

        Args:
            column_mapping (dict, optional): Forwarded to the parent.
                Defaults to None.
            modification_mapping (dict, optional): Forwarded to the parent.
                Defaults to None.
            fdr (float, optional): Forwarded to the parent.
                Defaults to 0.01.
            keep_decoy (bool, optional): Forwarded to the parent.
                Defaults to False.
            mod_sep (str, optional): Forwarded to the parent.
                NOTE(review): presumably the bracket pair that encloses
                modifications in the modified sequence -- confirm against
                `SpectronautReader`. Defaults to '()'.
            underscore_for_ncterm (bool, optional): Forwarded to the parent.
                Defaults to False.
            fixed_C57 (bool, optional): Forwarded to the parent.
                Defaults to False.
            mod_seq_columns (list, optional): Forwarded to the parent.
                Defaults to None, which is replaced by a new list
                ['ModifiedPeptide', 'ModifiedSequence',
                'FullUniModPeptideName'] on every call.
            csv_sep (str, optional): Forwarded to the parent.
                Defaults to a tab character.
            **kwargs: Accepted for call compatibility; they are not
                forwarded to the parent class.
        """
        # Build the default list per call: a list literal in the signature
        # would be one shared object that any mutation downstream would
        # leak into every later reader instance.
        if mod_seq_columns is None:
            mod_seq_columns = [
                'ModifiedPeptide',
                'ModifiedSequence',
                'FullUniModPeptideName',
            ]
        super().__init__(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            fdr=fdr, keep_decoy=keep_decoy,
            mod_sep=mod_sep,
            underscore_for_ncterm=underscore_for_ncterm,
            fixed_C57=fixed_C57,
            mod_seq_columns=mod_seq_columns,
            csv_sep=csv_sep,
        )
#!python
"""Create temporary memory-mapped (mmapped) NumPy arrays.

Every array is backed by its own HDF5 file inside one process-wide
temporary folder (`TEMP_DIR_NAME`). The folder is created when this module
is imported and replaced by `clear`, which is also registered to run at
interpreter exit.
"""

# builtin
import os
import logging
import atexit
import mmap
import tempfile

# external
import numpy as np
import h5py


# Keep the TemporaryDirectory object itself alive: the folder only exists
# for as long as this object has not been cleaned up.
_TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
TEMP_DIR_NAME = _TEMP_DIR.name

logging.warning(
    f"Temp mmap arrays are written to {TEMP_DIR_NAME}. "
    "Cleanup of this folder is OS dependant, "
    "and might need to be triggered manually!"
)


def array(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Create a writable temporary mmapped array.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable array with the requested shape and dtype, backed by a
        file in `TEMP_DIR_NAME`. Its initial contents are not set by this
        function. A shape without elements yields an ordinary empty
        in-memory array, because there is nothing to map.
    """
    element_count = int(np.prod(shape))
    if element_count == 0:
        # No data to back with a file; writing the first element below
        # would fail for an empty dataset.
        return np.empty(shape, dtype=dtype)
    # mkstemp guarantees a name that is unique within the folder and does
    # not touch NumPy's global random state. A name drawn from np.random
    # can repeat after re-seeding, and opening it with mode "w" would then
    # truncate the file behind an array that is still in use.
    file_descriptor, temp_file_name = tempfile.mkstemp(
        prefix="temp_mmap_",
        suffix=".hdf",
        dir=TEMP_DIR_NAME,
    )
    os.close(file_descriptor)
    with h5py.File(temp_file_name, "w") as hdf_file:
        dataset = hdf_file.create_dataset(
            "array",
            shape=shape,
            dtype=dtype
        )
        # NOTE(review): presumably this write makes HDF5 allocate the
        # dataset's storage so that get_offset() returns a byte offset
        # instead of None -- confirm against the h5py documentation.
        dataset[0] = 0
        offset = dataset.id.get_offset()
    with open(temp_file_name, "rb+") as raw_hdf_file:
        # The mapping is used after this `with` block has closed the file.
        mmap_obj = mmap.mmap(
            raw_hdf_file.fileno(),
            0,
            access=mmap.ACCESS_WRITE
        )
    return np.frombuffer(
        mmap_obj,
        dtype=dtype,
        count=element_count,
        offset=offset
    ).reshape(shape)


def zeros(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Create a writable temporary mmapped array filled with zeros.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable temporary mmapped array filled with zeros.
    """
    _array = array(shape, dtype)
    _array[:] = 0
    return _array


def ones(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Create a writable temporary mmapped array filled with ones.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable temporary mmapped array filled with ones.
    """
    _array = array(shape, dtype)
    _array[:] = 1
    return _array


@atexit.register
def clear() -> str:
    """Reset the temporary folder containing temp mmapped arrays.

    The current folder is deleted and a new, empty one is created. This
    function is also registered with `atexit`, so it runs once more when
    the interpreter shuts down.

    WARNING: All existing temp mmap arrays will be unusable!

    Returns
    -------
    str
        The name of the new temporary folder.
    """
    global _TEMP_DIR
    global TEMP_DIR_NAME
    logging.warning(
        f"Folder {TEMP_DIR_NAME} with temp mmap arrays is being deleted. "
        "All existing temp mmapp arrays will be unusable!"
    )
    # Delete the folder explicitly instead of dropping the last reference
    # and waiting for the object to be finalized: that only happens
    # promptly on interpreters with reference counting.
    try:
        _TEMP_DIR.cleanup()
    except OSError as error:
        # Deletion stays best effort (files that are still mapped can be
        # locked on some systems); a fresh folder is handed out regardless.
        logging.warning(
            f"Folder {TEMP_DIR_NAME} could not be deleted completely: "
            f"{error}"
        )
    _TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
    TEMP_DIR_NAME = _TEMP_DIR.name
    return TEMP_DIR_NAME
\n", - " | sequence | \n", - "charge | \n", - "rt | \n", - "precursor_mz | \n", - "mods | \n", - "mod_sites | \n", - "nAA | \n", - "rt_norm | \n", - "
---|---|---|---|---|---|---|---|---|
0 | \n", - "AAAAAAAAAASGAAIPPLIPPRR | \n", - "3 | \n", - "-10.0 | \n", - "685.732240 | \n", - "\n", - " | \n", - " | 23 | \n", - "0.000000 | \n", - "
1 | \n", - "AAAAAAAAAASGAAIPPLIPPRR | \n", - "4 | \n", - "59.2 | \n", - "514.550999 | \n", - "\n", - " | \n", - " | 23 | \n", - "0.618962 | \n", - "
2 | \n", - "AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR | \n", - "5 | \n", - "101.8 | \n", - "728.201724 | \n", - "\n", - " | \n", - " | 36 | \n", - "1.000000 | \n", - "