levitsky · levitsky · Oct 14, 2020 · Jul 16, 2020 · Oct 8, 2020 · Oct 9, 2020
diff --git a/pyteomics/auxiliary/__init__.py b/pyteomics/auxiliary/__init__.py
@@ -7,7 +7,7 @@
 
 from .structures import (
     PyteomicsError, Charge, ChargeList,
-    _parse_charge, BasicComposition,
+    _parse_charge, _parse_ion, BasicComposition,
     unitfloat, unitint, unitstr, cvstr,
     cvquery)
 

diff --git a/pyteomics/auxiliary/file_helpers.py b/pyteomics/auxiliary/file_helpers.py
diff --git a/pyteomics/auxiliary/structures.py b/pyteomics/auxiliary/structures.py
@@ -1,5 +1,6 @@
 import re
 from collections import defaultdict, Counter
+import warnings
 
 try:
     basestring
@@ -83,6 +84,19 @@ def __str__(self):
         return str(abs(self)) + '+-'[self < 0]
 
 
+class Ion(str):
+    """Represents an Ion, right now just a subclass of String.
+    """
+    _pattern = r'([abcxyz]\d+(\-H2O|\-NH3)?)([\+|-]\d+)'  # "y2-H2O+1"
+
+    def __init__(self, *args, **kwargs):
+        if args and isinstance(args[0], basestring):
+            try:
+                self.ion_type, self.neutral_loss, self.charge = re.match(self._pattern, args[0]).groups()
+            except Exception:
+                raise PyteomicsError("Malformed ion string, must match the regex {!r}".format(self._pattern))
+
+
 class ChargeList(list):
     """Just a list of :py:class:`Charge`s. When printed, looks like an
     enumeration of the list contents. Can also be constructed from such
@@ -118,6 +132,13 @@ def _parse_charge(s, list_only=False):
     return ChargeList(s)
 
 
+def _parse_ion(ion_text):
+    try:
+        return Ion(ion_text)
+    except Exception as e:
+        warnings.warn('Could not parse ion string: {} ({})'.format(ion_text, e.args[0]))
+
+
 class BasicComposition(defaultdict, Counter):
     """A generic dictionary for compositions.
     Keys should be strings, values should be integers.

diff --git a/pyteomics/mgf.py b/pyteomics/mgf.py
@@ -75,6 +75,7 @@
 import sys
 from . import auxiliary as aux
 
+
 class MGFBase(object):
     """Abstract mixin class representing an MGF file. Subclasses implement different approaches to parsing."""
     _comments = set('#;!/')
@@ -84,13 +85,13 @@ class MGFBase(object):
     _array_converters = {
         'm/z array': [_identity, _array, _array],
         'intensity array': [_identity, _array, _array],
-        'charge array': [_identity, _array, _ma]
+        'charge array': [_identity, _array, _ma],
+        'ion array': [_identity, _array,  _array]
     }
-    _array_keys = ['m/z array', 'intensity array', 'charge array']
-    _array_keys_unicode = [u'm/z array', u'intensity array', u'charge array']
+    _array_keys = ['m/z array', 'intensity array', 'charge array', 'ion array']
+    _array_keys_unicode = [u'm/z array', u'intensity array', u'charge array', u'ion array']
     encoding = None
 
-
     def __init__(self, source=None, **kwargs):
         """Create an MGF file object, set MGF-specific parameters.
 
@@ -107,17 +108,21 @@ def __init__(self, source=None, **kwargs):
             Default is :py:const:`True`.
 
         convert_arrays : one of {0, 1, 2}, optional, keyword only
-            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `0`, m/z, intensities and (possibly) charges or (possibly) ions will be returned as regular lists
             If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
             If `2`, charges will be reported as a masked array (default).
             The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
 
         read_charges : bool, optional, keyword only
             If `True` (default), fragment charges are reported. Disabling it improves performance.
 
+        read_ions : bool, optional
+            If `True` (default: False), fragment ions are reported. Disabling it improves performance.
+            Note that right now, only one of (read_charges, read_ions) may be True.
+
         dtype : type or str or dict, optional, keyword only
             dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
-            Keys should be 'm/z array', 'intensity array' and/or 'charge array'.
+            Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
 
         encoding : str, optional, keyword only
             File encoding.
@@ -129,6 +134,10 @@ def __init__(self, source=None, **kwargs):
         if self._convert_arrays and np is None:
             raise aux.PyteomicsError('numpy is required for array conversion')
         self._read_charges = kwargs.pop('read_charges', True)
+        self._read_ions = kwargs.pop('read_ions', False)
+        # Make sure no charges are read if ions are read
+        if self._read_ions:
+            self._read_charges = False
         dtype = kwargs.pop('dtype', None)
         self._dtype_dict = dtype if isinstance(dtype, dict) else {k: dtype for k in self._array_keys}
         if self._use_header:
@@ -142,6 +151,9 @@ def parse_precursor_charge(self, charge_text, list_only=False):
     def parse_peak_charge(self, charge_text, list_only=False):
         return aux._parse_charge(charge_text, list_only=False)
 
+    def parse_peak_ion(self, ion_text):
+        return aux._parse_ion(ion_text)
+
     @property
     def header(self):
         if self._header is None:
@@ -173,6 +185,7 @@ def _read_spectrum_lines(self, lines):
         masses = []
         intensities = []
         charges = []
+        ions = []
 
         params = self.header.copy() if self._use_header else {}
 
@@ -202,6 +215,8 @@ def _read_spectrum_lines(self, lines):
                 data = {'m/z array': masses, 'intensity array': intensities}
                 if self._read_charges:
                     data['charge array'] = charges
+                if self._read_ions:
+                    data['ion array'] = ions
                 for key, values in data.items():
                     out[key] = self._array_converters[key][self._convert_arrays](values, dtype=self._dtype_dict.get(key))
                 if self.encoding and sys.version_info.major == 2:
@@ -211,16 +226,18 @@ def _read_spectrum_lines(self, lines):
                 return out
 
             else:
-                if '=' in sline: # spectrum-specific parameters!
+                if '=' in sline:  # spectrum-specific parameters!
                     l = sline.split('=', 1)
                     params[l[0].lower()] = l[1].strip()
-                else: # this must be a peak list
+                else:  # this must be a peak list
                     l = sline.split()
                     try:
                         masses.append(float(l[0]))
                         intensities.append(float(l[1]))
                         if self._read_charges:
                             charges.append(self.parse_peak_charge(l[2]) if len(l) > 2 else 0)
+                        if self._read_ions:
+                            ions.append(self.parse_peak_ion(l[2]) if len(l) > 2 else "")
                     except ValueError:
                         raise aux.PyteomicsError(
                              'Error when parsing %s. Line:\n%s' % (getattr(self._source, 'name', 'MGF file'), line))
@@ -238,10 +255,11 @@ class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixi
     If created using a file object, it needs to be opened in binary mode.
 
     When iterated, :py:class:`IndexedMGF` object yields spectra one by one.
-    Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array',
-    'intensity array', 'charge array' and 'params'. 'm/z array' and
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
     'intensity array' store :py:class:`numpy.ndarray`'s of floats,
     'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion_array' is an array of Ions (str)
     and 'params' stores a :py:class:`dict` of parameters (keys and values are
     :py:class:`str`, keys corresponding to MGF, lowercased).
 
@@ -254,38 +272,41 @@ class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixi
     time : RTLocator
         A property used for accessing spectra by retention time.
     """
-
     delimiter = 'BEGIN IONS'
-    label = r'TITLE=([^\n]*\S)\s*'
 
     def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
-        dtype=None, encoding='utf-8', _skip_index=False, **kwargs):
+                 dtype=None, encoding='utf-8', index_by_scans=False, read_ions=False, _skip_index=False, **kwargs):
+        self.label = r'SCANS=(\d+)\s*' if index_by_scans else r'TITLE=([^\n]*\S)\s*'
         super(IndexedMGF, self).__init__(source, parser_func=self._read, pass_file=False, args=(), kwargs={},
-            use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges,
-            dtype=dtype, encoding=encoding, _skip_index=_skip_index, **kwargs)
+                                         use_header=use_header, convert_arrays=convert_arrays,
+                                         read_charges=read_charges,
+                                         dtype=dtype, encoding=encoding, read_ions=read_ions, _skip_index=_skip_index,
+                                         **kwargs)
 
     def __reduce_ex__(self, protocol):
         return (self.__class__,
-            (self._source_init, False, self._convert_arrays, self._read_charges,
-                self._dtype_dict, self.encoding, True),
-            self.__getstate__())
+                (self._source_init, False, self._convert_arrays, self._read_charges,
+                 self._dtype_dict, self.encoding, True),
+                self.__getstate__())
 
     def __getstate__(self):
         state = super(IndexedMGF, self).__getstate__()
         state['use_header'] = self._use_header
         state['header'] = self._header
+        state['read_ions'] = self._read_ions
         return state
 
     def __setstate__(self, state):
         super(IndexedMGF, self).__setstate__(state)
         self._header = state['header']
         self._use_header = state['use_header']
+        self._read_ions = state['read_ions']
 
     @aux._keepstate_method
     def _read_header(self):
         try:
             first = next(v for v in self._offset_index.values())[0]
-        except StopIteration: # the index is empty, no spectra in file
+        except StopIteration:  # the index is empty, no spectra in file
             first = -1
         header_lines = self.read(first).decode(self.encoding).split('\n')
         return self._read_header_lines(header_lines)
@@ -318,10 +339,11 @@ class MGF(MGFBase, aux.FileReader):
     constant-time access to spectra.
 
     :py:class:`MGF` object behaves as an iterator, **yielding** spectra one by one.
-    Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array',
-    'intensity array', 'charge array' and 'params'. 'm/z array' and
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
     'intensity array' store :py:class:`numpy.ndarray`'s of floats,
     'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion_array' is a masked array of Ions (str)
     and 'params' stores a :py:class:`dict` of parameters (keys and values are
     :py:class:`str`, keys corresponding to MGF, lowercased).
 
@@ -334,9 +356,10 @@ class MGF(MGFBase, aux.FileReader):
     """
 
     def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
-        dtype=None, encoding=None):
+            read_ions=False, dtype=None, encoding=None):
         super(MGF, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
-            encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges, dtype=dtype)
+            encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges,
+            read_ions=read_ions, dtype=dtype)
         # self.encoding = encoding
 
     @aux._keepstate_method
@@ -391,9 +414,13 @@ def read(*args, **kwargs):
     read_charges : bool, optional
         If `True` (default), fragment charges are reported. Disabling it improves performance.
 
+    read_ions : bool, optional
+        If `True` (default: False), fragment charges are reported. Disabling it improves performance.
+        Note that right now, only one of (read_charges, read_ions) may be True.
+
     dtype : type or str or dict, optional
         dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
-        Keys should be 'm/z array', 'intensity array' and/or 'charge array'.
+        Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
 
     encoding : str, optional
         File encoding.
@@ -490,30 +517,33 @@ def read_header(source):
 
 _default_key_order = ['title', 'pepmass', 'rtinseconds', 'charge']
 
+
 def _pepmass_repr(k, pepmass):
     outstr = k.upper() + '='
-    if not isinstance(pepmass, (str, int, float)): # assume iterable
+    if not isinstance(pepmass, (str, int, float)):  # assume iterable
         try:
             outstr += ' '.join(str(x) for x in pepmass if x is not None)
         except TypeError:
-            raise aux.PyteomicsError(
-                    'Cannot handle parameter: PEPMASS = {}'.format(pepmass))
+            raise aux.PyteomicsError('Cannot handle parameter: PEPMASS = {}'.format(pepmass))
     else:
         outstr += str(pepmass)
     return outstr
 
+
 def _charge_repr(k, charge):
     return '{}={}'.format(k.upper(), aux._parse_charge(str(charge)))
 
+
 def _default_repr(key, val):
     return '{}={}'.format(key.upper(), val)
 
+
 _default_value_formatters = {'pepmass': _pepmass_repr, 'charge': _charge_repr}
 
+
 @aux._file_writer()
-def write(spectra, output=None, header='', key_order=_default_key_order,
-    fragment_format=None, write_charges=True, use_numpy=None,
-    param_formatters=_default_value_formatters):
+def write(spectra, output=None, header='', key_order=_default_key_order, fragment_format=None,
+        write_charges=True, write_ions=False, use_numpy=None, param_formatters=_default_value_formatters):
     """
     Create a file in MGF format.
 
@@ -549,8 +579,13 @@ def write(spectra, output=None, header='', key_order=_default_key_order,
         If :py:const:`False`, fragment charges from 'charge array' will not be written.
         Default is :py:const:`True`.
 
+    write_ions : bool, optional
+        If :py:const:`False`, fragment ions from 'ion array' will not be written.
+        If :py:const:`True`, then `write_charges` is set to :py:const:`False`.
+        Default is :py:const:`False`.
+
     fragment_format : str, optional
-        Format string for m/z, intensity and charge of a fragment. Useful to set
+        Format string for m/z, intensity and charge (or ion annotation) of a fragment. Useful to set
         the number of decimal places, e.g.:
         ``fragment_format='%.4f %.0f'``. Default is ``'{} {} {}'``.
 
@@ -609,10 +644,13 @@ def key_value_line(key, val):
         fragment_format = '{} {} {}'
         np_format_2 = '%.5f %.1f'
         np_format_3 = '%.5f %.1f %d'
+        np_format_i = '%.5f %.1f %s'
     else:
-        np_format_2 = np_format_3 = fragment_format
+        np_format_2 = np_format_3 = np_format_i = fragment_format
     format_str = fragment_format + '\n'
 
+    if write_ions:
+        write_charges = False
     if use_numpy is None:
         use_numpy = not write_charges
 
@@ -629,9 +667,8 @@ def key_value_line(key, val):
             head_str = '\n'.join(header)
         head_dict = {}
         for line in head_lines:
-            if not line.strip() or any(
-                line.startswith(c) for c in MGF._comments):
-               continue
+            if not line.strip() or any(line.startswith(c) for c in MGF._comments):
+                continue
             l = line.split('=')
             if len(l) == 2:
                 head_dict[l[0].lower()] = l[1].strip()
@@ -651,7 +688,7 @@ def key_value_line(key, val):
         try:
             success = True
             if np is not None and use_numpy:
-                if not write_charges or 'charge array' not in spectrum:
+                if (not write_charges or 'charge array' not in spectrum) and (not write_ions or 'ion array' not in spectrum):
                     X = np.empty((len(spectrum['m/z array']), 2))
                     X[:, 0] = spectrum['m/z array']
                     X[:, 1] = spectrum['intensity array']
@@ -662,6 +699,12 @@ def key_value_line(key, val):
                     X[:, 1] = spectrum['intensity array']
                     X[:, 2] = spectrum['charge array']
                     np.savetxt(output, X, fmt=np_format_3)
+                elif isinstance(spectrum.get('ion array'), np.ndarray):
+                    X = np.empty((len(spectrum['m/z array']), 3), dtype=object)
+                    X[:, 0] = spectrum['m/z array']
+                    X[:, 1] = spectrum['intensity array']
+                    X[:, 2] = spectrum['ion array']
+                    np.savetxt(output, X, fmt=np_format_i)
                 else:
                     success = False
             else:
@@ -670,7 +713,9 @@ def key_value_line(key, val):
             if not success:
                 for m, i, c in zip(spectrum['m/z array'],
                         spectrum['intensity array'],
-                        spectrum.get('charge array', it.cycle((None,)))):
+                        spectrum.get('charge array', it.cycle((None,))) if write_charges else
+                            spectrum.get('ion array', it.cycle((None,))) if write_ions else
+                            it.cycle((None,))):
                     output.write(format_str.format(
                         m, i,
                         (c if c not in nones else '')))
@@ -679,4 +724,5 @@ def key_value_line(key, val):
         output.write('END IONS\n\n')
     return output
 
-chain = aux._make_chain(read, 'read')
+
+chain = aux._make_chain(read, 'read')