Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse Annotated MGFs #12

Merged
merged 9 commits into from
Oct 14, 2020
Merged
2 changes: 1 addition & 1 deletion pyteomics/auxiliary/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .structures import (
PyteomicsError, Charge, ChargeList,
_parse_charge, BasicComposition,
_parse_charge, _parse_ion, BasicComposition,
unitfloat, unitint, unitstr, cvstr,
cvquery)

Expand Down
Empty file modified pyteomics/auxiliary/file_helpers.py
100644 → 100755
Empty file.
21 changes: 21 additions & 0 deletions pyteomics/auxiliary/structures.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from collections import defaultdict, Counter
import warnings

try:
basestring
Expand Down Expand Up @@ -83,6 +84,19 @@ def __str__(self):
return str(abs(self)) + '+-'[self < 0]


class Ion(str):
"""Represents an Ion, right now just a subclass of String.
"""
_pattern = r'([abcxyz]\d+(\-H2O|\-NH3)?)([\+|-]\d+)' # "y2-H2O+1"

def __init__(self, *args, **kwargs):
if args and isinstance(args[0], basestring):
try:
self.ion_type, self.neutral_loss, self.charge = re.match(self._pattern, args[0]).groups()
except Exception:
raise PyteomicsError("Malformed ion string, must match the regex {!r}".format(self._pattern))


class ChargeList(list):
"""Just a list of :py:class:`Charge`s. When printed, looks like an
enumeration of the list contents. Can also be constructed from such
Expand Down Expand Up @@ -118,6 +132,13 @@ def _parse_charge(s, list_only=False):
return ChargeList(s)


def _parse_ion(ion_text):
try:
return Ion(ion_text)
except Exception as e:
warnings.warn('Could not parse ion string: {} ({})'.format(ion_text, e.args[0]))


class BasicComposition(defaultdict, Counter):
"""A generic dictionary for compositions.
Keys should be strings, values should be integers.
Expand Down
122 changes: 84 additions & 38 deletions pyteomics/mgf.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
import sys
from . import auxiliary as aux


class MGFBase(object):
"""Abstract mixin class representing an MGF file. Subclasses implement different approaches to parsing."""
_comments = set('#;!/')
Expand All @@ -84,13 +85,13 @@ class MGFBase(object):
_array_converters = {
'm/z array': [_identity, _array, _array],
'intensity array': [_identity, _array, _array],
'charge array': [_identity, _array, _ma]
'charge array': [_identity, _array, _ma],
'ion array': [_identity, _array, _array]
}
_array_keys = ['m/z array', 'intensity array', 'charge array']
_array_keys_unicode = [u'm/z array', u'intensity array', u'charge array']
_array_keys = ['m/z array', 'intensity array', 'charge array', 'ion array']
_array_keys_unicode = [u'm/z array', u'intensity array', u'charge array', u'ion array']
encoding = None


def __init__(self, source=None, **kwargs):
"""Create an MGF file object, set MGF-specific parameters.

Expand All @@ -107,17 +108,21 @@ def __init__(self, source=None, **kwargs):
Default is :py:const:`True`.

convert_arrays : one of {0, 1, 2}, optional, keyword only
If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
If `0`, m/z, intensities and (possibly) charges or (possibly) ions will be returned as regular lists
If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
If `2`, charges will be reported as a masked array (default).
The default option is the slowest. `1` and `2` require :py:mod:`numpy`.

read_charges : bool, optional, keyword only
If `True` (default), fragment charges are reported. Disabling it improves performance.

read_ions : bool, optional
If `True` (default: False), fragment ions are reported. Disabling it improves performance.
Note that right now, only one of (read_charges, read_ions) may be True.

dtype : type or str or dict, optional, keyword only
dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
Keys should be 'm/z array', 'intensity array' and/or 'charge array'.
Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.

encoding : str, optional, keyword only
File encoding.
Expand All @@ -129,6 +134,10 @@ def __init__(self, source=None, **kwargs):
if self._convert_arrays and np is None:
raise aux.PyteomicsError('numpy is required for array conversion')
self._read_charges = kwargs.pop('read_charges', True)
self._read_ions = kwargs.pop('read_ions', False)
# Make sure no charges are read if ions are read
if self._read_ions:
self._read_charges = False
dtype = kwargs.pop('dtype', None)
self._dtype_dict = dtype if isinstance(dtype, dict) else {k: dtype for k in self._array_keys}
if self._use_header:
Expand All @@ -142,6 +151,9 @@ def parse_precursor_charge(self, charge_text, list_only=False):
def parse_peak_charge(self, charge_text, list_only=False):
return aux._parse_charge(charge_text, list_only=False)

def parse_peak_ion(self, ion_text):
return aux._parse_ion(ion_text)

@property
def header(self):
if self._header is None:
Expand Down Expand Up @@ -173,6 +185,7 @@ def _read_spectrum_lines(self, lines):
masses = []
intensities = []
charges = []
ions = []

params = self.header.copy() if self._use_header else {}

Expand Down Expand Up @@ -202,6 +215,8 @@ def _read_spectrum_lines(self, lines):
data = {'m/z array': masses, 'intensity array': intensities}
if self._read_charges:
data['charge array'] = charges
if self._read_ions:
data['ion array'] = ions
for key, values in data.items():
out[key] = self._array_converters[key][self._convert_arrays](values, dtype=self._dtype_dict.get(key))
if self.encoding and sys.version_info.major == 2:
Expand All @@ -211,16 +226,18 @@ def _read_spectrum_lines(self, lines):
return out

else:
if '=' in sline: # spectrum-specific parameters!
if '=' in sline: # spectrum-specific parameters!
l = sline.split('=', 1)
params[l[0].lower()] = l[1].strip()
else: # this must be a peak list
else: # this must be a peak list
l = sline.split()
try:
masses.append(float(l[0]))
intensities.append(float(l[1]))
if self._read_charges:
charges.append(self.parse_peak_charge(l[2]) if len(l) > 2 else 0)
if self._read_ions:
ions.append(self.parse_peak_ion(l[2]) if len(l) > 2 else "")
except ValueError:
raise aux.PyteomicsError(
'Error when parsing %s. Line:\n%s' % (getattr(self._source, 'name', 'MGF file'), line))
Expand All @@ -238,10 +255,11 @@ class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixi
If created using a file object, it needs to be opened in binary mode.

When iterated, :py:class:`IndexedMGF` object yields spectra one by one.
Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array',
'intensity array', 'charge array' and 'params'. 'm/z array' and
Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
'intensity array' store :py:class:`numpy.ndarray`'s of floats,
'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
'ion_array' is an array of Ions (str)
and 'params' stores a :py:class:`dict` of parameters (keys and values are
:py:class:`str`, keys corresponding to MGF, lowercased).

Expand All @@ -254,38 +272,41 @@ class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixi
time : RTLocator
A property used for accessing spectra by retention time.
"""

delimiter = 'BEGIN IONS'
label = r'TITLE=([^\n]*\S)\s*'

def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
dtype=None, encoding='utf-8', _skip_index=False, **kwargs):
dtype=None, encoding='utf-8', index_by_scans=False, read_ions=False, _skip_index=False, **kwargs):
self.label = r'SCANS=(\d+)\s*' if index_by_scans else r'TITLE=([^\n]*\S)\s*'
super(IndexedMGF, self).__init__(source, parser_func=self._read, pass_file=False, args=(), kwargs={},
use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges,
dtype=dtype, encoding=encoding, _skip_index=_skip_index, **kwargs)
use_header=use_header, convert_arrays=convert_arrays,
read_charges=read_charges,
dtype=dtype, encoding=encoding, read_ions=read_ions, _skip_index=_skip_index,
**kwargs)

def __reduce_ex__(self, protocol):
return (self.__class__,
(self._source_init, False, self._convert_arrays, self._read_charges,
self._dtype_dict, self.encoding, True),
self.__getstate__())
(self._source_init, False, self._convert_arrays, self._read_charges,
self._dtype_dict, self.encoding, True),
self.__getstate__())

def __getstate__(self):
state = super(IndexedMGF, self).__getstate__()
state['use_header'] = self._use_header
state['header'] = self._header
state['read_ions'] = self._read_ions
return state

def __setstate__(self, state):
super(IndexedMGF, self).__setstate__(state)
self._header = state['header']
self._use_header = state['use_header']
self._read_ions = state['read_ions']

@aux._keepstate_method
def _read_header(self):
try:
first = next(v for v in self._offset_index.values())[0]
except StopIteration: # the index is empty, no spectra in file
except StopIteration: # the index is empty, no spectra in file
first = -1
header_lines = self.read(first).decode(self.encoding).split('\n')
return self._read_header_lines(header_lines)
Expand Down Expand Up @@ -318,10 +339,11 @@ class MGF(MGFBase, aux.FileReader):
constant-time access to spectra.

:py:class:`MGF` object behaves as an iterator, **yielding** spectra one by one.
Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array',
'intensity array', 'charge array' and 'params'. 'm/z array' and
Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
'intensity array' store :py:class:`numpy.ndarray`'s of floats,
'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
'ion_array' is a masked array of Ions (str)
and 'params' stores a :py:class:`dict` of parameters (keys and values are
:py:class:`str`, keys corresponding to MGF, lowercased).

Expand All @@ -334,9 +356,10 @@ class MGF(MGFBase, aux.FileReader):
"""

def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
dtype=None, encoding=None):
read_ions=False, dtype=None, encoding=None):
super(MGF, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges, dtype=dtype)
encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges,
read_ions=read_ions, dtype=dtype)
# self.encoding = encoding

@aux._keepstate_method
Expand Down Expand Up @@ -391,9 +414,13 @@ def read(*args, **kwargs):
read_charges : bool, optional
If `True` (default), fragment charges are reported. Disabling it improves performance.

read_ions : bool, optional
If `True` (default: False), fragment charges are reported. Disabling it improves performance.
Note that right now, only one of (read_charges, read_ions) may be True.

dtype : type or str or dict, optional
dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
Keys should be 'm/z array', 'intensity array' and/or 'charge array'.
Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.

encoding : str, optional
File encoding.
Expand Down Expand Up @@ -490,30 +517,33 @@ def read_header(source):

_default_key_order = ['title', 'pepmass', 'rtinseconds', 'charge']


def _pepmass_repr(k, pepmass):
outstr = k.upper() + '='
if not isinstance(pepmass, (str, int, float)): # assume iterable
if not isinstance(pepmass, (str, int, float)): # assume iterable
try:
outstr += ' '.join(str(x) for x in pepmass if x is not None)
except TypeError:
raise aux.PyteomicsError(
'Cannot handle parameter: PEPMASS = {}'.format(pepmass))
raise aux.PyteomicsError('Cannot handle parameter: PEPMASS = {}'.format(pepmass))
else:
outstr += str(pepmass)
return outstr


def _charge_repr(k, charge):
return '{}={}'.format(k.upper(), aux._parse_charge(str(charge)))


def _default_repr(key, val):
return '{}={}'.format(key.upper(), val)


_default_value_formatters = {'pepmass': _pepmass_repr, 'charge': _charge_repr}


@aux._file_writer()
def write(spectra, output=None, header='', key_order=_default_key_order,
fragment_format=None, write_charges=True, use_numpy=None,
param_formatters=_default_value_formatters):
def write(spectra, output=None, header='', key_order=_default_key_order, fragment_format=None,
write_charges=True, write_ions=False, use_numpy=None, param_formatters=_default_value_formatters):
"""
Create a file in MGF format.

Expand Down Expand Up @@ -549,8 +579,13 @@ def write(spectra, output=None, header='', key_order=_default_key_order,
If :py:const:`False`, fragment charges from 'charge array' will not be written.
Default is :py:const:`True`.

write_ions : bool, optional
If :py:const:`False`, fragment ions from 'ion array' will not be written.
If :py:const:`True`, then `write_charges` is set to :py:const:`False`.
Default is :py:const:`False`.

fragment_format : str, optional
Format string for m/z, intensity and charge of a fragment. Useful to set
Format string for m/z, intensity and charge (or ion annotation) of a fragment. Useful to set
the number of decimal places, e.g.:
``fragment_format='%.4f %.0f'``. Default is ``'{} {} {}'``.

Expand Down Expand Up @@ -609,10 +644,13 @@ def key_value_line(key, val):
fragment_format = '{} {} {}'
np_format_2 = '%.5f %.1f'
np_format_3 = '%.5f %.1f %d'
np_format_i = '%.5f %.1f %s'
else:
np_format_2 = np_format_3 = fragment_format
np_format_2 = np_format_3 = np_format_i = fragment_format
format_str = fragment_format + '\n'

if write_ions:
write_charges = False
if use_numpy is None:
use_numpy = not write_charges

Expand All @@ -629,9 +667,8 @@ def key_value_line(key, val):
head_str = '\n'.join(header)
head_dict = {}
for line in head_lines:
if not line.strip() or any(
line.startswith(c) for c in MGF._comments):
continue
if not line.strip() or any(line.startswith(c) for c in MGF._comments):
continue
l = line.split('=')
if len(l) == 2:
head_dict[l[0].lower()] = l[1].strip()
Expand All @@ -651,7 +688,7 @@ def key_value_line(key, val):
try:
success = True
if np is not None and use_numpy:
if not write_charges or 'charge array' not in spectrum:
if (not write_charges or 'charge array' not in spectrum) and (not write_ions or 'ion array' not in spectrum):
X = np.empty((len(spectrum['m/z array']), 2))
X[:, 0] = spectrum['m/z array']
X[:, 1] = spectrum['intensity array']
Expand All @@ -662,6 +699,12 @@ def key_value_line(key, val):
X[:, 1] = spectrum['intensity array']
X[:, 2] = spectrum['charge array']
np.savetxt(output, X, fmt=np_format_3)
elif isinstance(spectrum.get('ion array'), np.ndarray):
X = np.empty((len(spectrum['m/z array']), 3), dtype=object)
X[:, 0] = spectrum['m/z array']
X[:, 1] = spectrum['intensity array']
X[:, 2] = spectrum['ion array']
np.savetxt(output, X, fmt=np_format_i)
else:
success = False
else:
Expand All @@ -670,7 +713,9 @@ def key_value_line(key, val):
if not success:
for m, i, c in zip(spectrum['m/z array'],
spectrum['intensity array'],
spectrum.get('charge array', it.cycle((None,)))):
spectrum.get('charge array', it.cycle((None,))) if write_charges else
spectrum.get('ion array', it.cycle((None,))) if write_ions else
it.cycle((None,))):
output.write(format_str.format(
m, i,
(c if c not in nones else '')))
Expand All @@ -679,4 +724,5 @@ def key_value_line(key, val):
output.write('END IONS\n\n')
return output

chain = aux._make_chain(read, 'read')

chain = aux._make_chain(read, 'read')
Loading