Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix spec idx #23

Merged
merged 7 commits into from
Mar 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions alphabase/_nbdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"get_x_tandem_score": "alphapept_reader.ipynb",
"AlphaPeptReader": "alphapept_reader.ipynb",
"SpectronautReader": "dia_search_reader.ipynb",
"SwathReader": "dia_search_reader.ipynb",
"DiannReader": "dia_search_reader.ipynb",
"parse_mod_seq": "maxquant_reader.ipynb",
"MaxQuantReader": "maxquant_reader.ipynb",
Expand Down
37 changes: 35 additions & 2 deletions alphabase/io/psm_reader/dia_search_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbdev_nbs/io/psm_reader/dia_search_reader.ipynb (unless otherwise specified).

__all__ = ['SpectronautReader', 'DiannReader']
__all__ = ['SpectronautReader', 'SwathReader', 'DiannReader']

# Cell
import pandas as pd
Expand Down Expand Up @@ -67,6 +67,36 @@ def _load_file(self, filename):

return df

class SwathReader(SpectronautReader):
    """`SpectronautReader` subclass with its own constructor defaults.

    All named options are handed unchanged to
    `SpectronautReader.__init__`; no other behavior is overridden here.
    """

    def __init__(self,
        *,
        column_mapping:dict = None,
        modification_mapping:dict = None,
        fdr = 0.01,
        keep_decoy = False,
        mod_sep = '()',
        underscore_for_ncterm=False,
        fixed_C57 = False,
        mod_seq_columns=[
            'ModifiedPeptide',
            'ModifiedSequence',
            'FullUniModPeptideName',
        ],
        csv_sep = '\t',
        **kwargs,
    ):
        """Forward the reader options to the parent constructor.

        Every named keyword argument is passed on under the same name.
        Additional keyword arguments collected in `kwargs` are accepted
        but are not forwarded.
        """
        # Gather the options first so the parent call stays a single line.
        parent_options = dict(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            fdr=fdr,
            keep_decoy=keep_decoy,
            mod_sep=mod_sep,
            underscore_for_ncterm=underscore_for_ncterm,
            fixed_C57=fixed_C57,
            mod_seq_columns=mod_seq_columns,
            csv_sep=csv_sep,
        )
        super().__init__(**parent_options)


class DiannReader(SpectronautReader):
def __init__(self,
*,
Expand Down Expand Up @@ -106,7 +136,10 @@ def _load_file(self, filename):
'spectronaut', SpectronautReader
)
psm_reader_provider.register_reader(
'openswath', SpectronautReader
'openswath', SwathReader
)
psm_reader_provider.register_reader(
'swath', SwathReader
)
psm_reader_provider.register_reader(
'diann', DiannReader
Expand Down
5 changes: 5 additions & 0 deletions alphabase/io/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self,
):
"""The Base class for all PSMReaders. The key of the sub-classes for different
search engine format is to re-define `column_mapping` and `modification_mapping`.

Args:
column_mapping (dict, optional):
A dict that maps alphabase's columns to other search engine's.
Expand Down Expand Up @@ -114,6 +115,7 @@ def __init__(self,
Defaults to 0.01.
keep_decoy(bool, optional): If keep decoy PSMs in self.psm_df.
Defaults to False.

Attributes:
column_mapping (dict): dict structure same as column_mapping in Args.
modification_mapping (dict): dict structure same as modification_mapping in Args.
Expand Down Expand Up @@ -295,6 +297,9 @@ def _translate_columns(self, origin_df:pd.DataFrame):
self._psm_df[col] = origin_df[other_col]
break

if 'scan_num' in self._psm_df.columns:
self._psm_df['spec_idx'] = self._psm_df.scan_num - 1


def _load_modifications(self, origin_df:pd.DataFrame):
"""Read modification information from 'origin_df'.
Expand Down
127 changes: 127 additions & 0 deletions alphabase/io/tempmmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!python
"""This module allows to create temporary mmapped arrays."""

# builtin
import os
import logging
import atexit

# external
import numpy as np
import mmap
import h5py
import tempfile


# Temporary directory object, created once at import time with the
# "temp_mmap_" prefix. Keeping this reference keeps the directory alive.
_TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
# Path of the temporary directory as a string.
TEMP_DIR_NAME = _TEMP_DIR.name

# Logged on import so the user knows where the backing files are written
# and that the folder may have to be removed manually.
logging.warning(
f"Temp mmap arrays are written to {TEMP_DIR_NAME}. "
"Cleanup of this folder is OS dependant, "
"and might need to be triggered manually!"
)


def array(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Create a writable temporary mmapped array.

    A new HDF5 file holding a single dataset named "array" is created in
    ``TEMP_DIR_NAME``. The file is then memory-mapped and the returned
    numpy array is a view on that map, starting at the dataset's offset.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable temporary mmapped array.
    """
    # mkstemp guarantees a file name that does not exist yet. A name drawn
    # from np.random can repeat after np.random.seed() and would then
    # truncate a file that is still mapped; np.random.randint(2**63) also
    # fails where numpy's default integer is 32 bit.
    file_descriptor, temp_file_name = tempfile.mkstemp(
        prefix="temp_mmap_",
        suffix=".hdf",
        dir=TEMP_DIR_NAME,
    )
    # Only the name is needed; h5py reopens the file itself.
    os.close(file_descriptor)
    with h5py.File(temp_file_name, "w") as hdf_file:
        dataset = hdf_file.create_dataset(
            "array",
            shape=shape,
            dtype=dtype
        )
        # NOTE(review): this write presumably forces HDF5 to allocate the
        # dataset so that get_offset() returns a real offset - confirm.
        dataset[0] = 0
        offset = dataset.id.get_offset()
    with open(temp_file_name, "rb+") as raw_hdf_file:
        mmap_obj = mmap.mmap(
            raw_hdf_file.fileno(),
            0,
            access=mmap.ACCESS_WRITE
        )
        return np.frombuffer(
            mmap_obj,
            dtype=dtype,
            # Force a 64 bit product so large shapes cannot overflow a
            # 32 bit platform default integer.
            count=int(np.prod(shape, dtype=np.int64)),
            offset=offset
        ).reshape(shape)


def zeros(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Return a temporary mmapped array with every element set to zero.

    The array is obtained from `array(shape, dtype)` and then overwritten
    with zeros.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable temporary mmapped array filled with zeros.
    """
    zero_filled = array(shape, dtype)
    zero_filled[:] = 0
    return zero_filled


def ones(shape: tuple, dtype: np.dtype) -> np.ndarray:
    """Return a temporary mmapped array with every element set to one.

    The array is obtained from `array(shape, dtype)` and then overwritten
    with ones.

    Parameters
    ----------
    shape : tuple
        A tuple with the shape of the array.
    dtype : type
        The np.dtype of the array.

    Returns
    -------
    np.ndarray
        A writable temporary mmapped array filled with ones.
    """
    one_filled = array(shape, dtype)
    one_filled[:] = 1
    return one_filled


@atexit.register
def clear() -> str:
    """Reset the temporary folder containing temp mmapped arrays.

    The current folder is deleted and a new, empty one is created. The
    function is also registered with `atexit`, so it runs once more when
    the interpreter exits.

    WARNING: All existing temp mmap arrays will be unusable!

    Returns
    -------
    str
        The name of the new temporary folder.
    """
    global _TEMP_DIR
    global TEMP_DIR_NAME
    logging.warning(
        f"Folder {TEMP_DIR_NAME} with temp mmap arrays is being deleted. "
        "All existing temp mmapp arrays will be unusable!"
    )
    # Delete explicitly: `del _TEMP_DIR` only unbinds the name and leaves
    # the removal to the object's finalizer, which does not run while any
    # other reference to the object exists.
    try:
        _TEMP_DIR.cleanup()
    except OSError as error:
        # Files that are still in use may not be removable on every OS.
        # Report it and carry on so a fresh folder is still provided.
        logging.warning(
            f"Could not completely delete folder {TEMP_DIR_NAME}: {error}"
        )
    _TEMP_DIR = tempfile.TemporaryDirectory(prefix="temp_mmap_")
    TEMP_DIR_NAME = _TEMP_DIR.name
    return TEMP_DIR_NAME
138 changes: 46 additions & 92 deletions nbdev_nbs/io/psm_reader/dia_search_reader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,36 @@
" \n",
" return df\n",
"\n",
"class SwathReader(SpectronautReader):\n",
" def __init__(self,\n",
" *,\n",
" column_mapping:dict = None,\n",
" modification_mapping:dict = None,\n",
" fdr = 0.01,\n",
" keep_decoy = False,\n",
" mod_sep = '()',\n",
" underscore_for_ncterm=False,\n",
" fixed_C57 = False,\n",
" mod_seq_columns=[\n",
" 'ModifiedPeptide',\n",
" 'ModifiedSequence',\n",
" 'FullUniModPeptideName',\n",
" ],\n",
" csv_sep = '\\t',\n",
" **kwargs,\n",
" ):\n",
" super().__init__(\n",
" column_mapping=column_mapping,\n",
" modification_mapping=modification_mapping,\n",
" fdr=fdr, keep_decoy=keep_decoy,\n",
" mod_sep=mod_sep,\n",
" underscore_for_ncterm=underscore_for_ncterm,\n",
" fixed_C57=fixed_C57,\n",
" mod_seq_columns=mod_seq_columns,\n",
" csv_sep=csv_sep,\n",
" )\n",
"\n",
"\n",
"class DiannReader(SpectronautReader):\n",
" def __init__(self,\n",
" *,\n",
Expand Down Expand Up @@ -126,7 +156,10 @@
" 'spectronaut', SpectronautReader\n",
")\n",
"psm_reader_provider.register_reader(\n",
" 'openswath', SpectronautReader\n",
" 'openswath', SwathReader\n",
")\n",
"psm_reader_provider.register_reader(\n",
" 'swath', SwathReader\n",
")\n",
"psm_reader_provider.register_reader(\n",
" 'diann', DiannReader\n",
Expand Down Expand Up @@ -291,93 +324,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sequence</th>\n",
" <th>charge</th>\n",
" <th>rt</th>\n",
" <th>precursor_mz</th>\n",
" <th>mods</th>\n",
" <th>mod_sites</th>\n",
" <th>nAA</th>\n",
" <th>rt_norm</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AAAAAAAAAASGAAIPPLIPPRR</td>\n",
" <td>3</td>\n",
" <td>-10.0</td>\n",
" <td>685.732240</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>23</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AAAAAAAAAASGAAIPPLIPPRR</td>\n",
" <td>4</td>\n",
" <td>59.2</td>\n",
" <td>514.550999</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>23</td>\n",
" <td>0.618962</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR</td>\n",
" <td>5</td>\n",
" <td>101.8</td>\n",
" <td>728.201724</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>36</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sequence charge rt precursor_mz mods \\\n",
"0 AAAAAAAAAASGAAIPPLIPPRR 3 -10.0 685.732240 \n",
"1 AAAAAAAAAASGAAIPPLIPPRR 4 59.2 514.550999 \n",
"2 AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR 5 101.8 728.201724 \n",
"\n",
" mod_sites nAA rt_norm \n",
"0 23 0.000000 \n",
"1 23 0.618962 \n",
"2 36 1.000000 "
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from io import StringIO\n",
"tsv = StringIO('''PrecursorMz\tProductMz\tTr_recalibrated\ttransition_name\tCE\tLibraryIntensity\ttransition_group_id\tdecoy\tPeptideSequence\tProteinName\tAnnotation\tFullUniModPeptideName\tPrecursorCharge\tGroupLabel\tUniprotID\tFragmentType\tFragmentCharge\tFragmentSeriesNumber\n",
Expand All @@ -396,8 +343,9 @@
"\n",
"\n",
"osw_reader = psm_reader_provider.get_reader('openswath')\n",
"osw_reader.import_file(tsv)\n",
"osw_reader.psm_df"
"psm_df = osw_reader.import_file(tsv)\n",
"assert psm_df.loc[2,'mod_sites'] == '30'\n",
"assert psm_df.loc[2,'mods'] == 'Carbamidomethyl@C'"
]
},
{
Expand Down Expand Up @@ -790,8 +738,7 @@
"F:\\XXX\\20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642.d\t20201218_tims03_Evo03_PS_SA_HeLa_200ng_high_speed_21min_8cm_S2-A8_1_22642\tP28482\tP28482\t\tMAPK1\t69911.3\t68996.2\t63388.2\t69911.3\t68996.2\t63388.2\t63388.2\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR\tAAAAAAGAGPEMVR\t(UniMod:1)AAAAAAGAGPEM(UniMod:35)VR2\t2\t0.00122498\t0.000834654\t0.000152765\t0.000152765\t0.000146135\t0.000154631\t0\t1\t1572.67\t1552.08\t1572.67\t0.906427\t7.45711\t7.40943\t7.50482\t15.9025\t7.43922\t16.0749\t0\t0.371998\t5937.05\t0.30888\t0.510876\t0.72688\t0.95182\t1.96259\t0.65474\t1320.01;838.009;638.006;827.009;562.005;339.003;\t1320.01;252.656;0;213.073;330.325;0;\t0.976001;0.542934;0.346963;0.38014;0.442774;-0.259898;\t11239\t1.01773\t1.0262\t1.02509\t1.01834\n",
"''')\n",
"diann_reader = psm_reader_provider.get_reader('diann')\n",
"diann_reader.import_file(tsv)\n",
"diann_reader.psm_df"
"diann_reader.import_file(tsv)"
]
},
{
Expand Down Expand Up @@ -866,6 +813,13 @@
" 'S[UniMod:21]'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading