From c98feca822cf0028f5d43f7586402fed6f0a8436 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sat, 12 Dec 2020 15:45:51 -0500
Subject: [PATCH 01/27] experimenting

---
 proforma_parsing.ipynb | 460 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+)
 create mode 100644 proforma_parsing.ipynb

diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb
new file mode 100644
index 00000000..f87866a1
--- /dev/null
+++ b/proforma_parsing.ipynb
@@ -0,0 +1,460 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from collections import namedtuple, defaultdict\n",
+    "from enum import Enum\n",
+    "\n",
+    "from six import add_metaclass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class PrefixSavingMeta(type):\n",
+    "    def __new__(mcs, name, parents, attrs):\n",
+    "        new_type = type.__new__(mcs, name, parents, attrs)\n",
+    "        prefix = attrs.get(\"prefix_name\")\n",
+    "        if prefix:\n",
+    "            new_type.prefix_map[prefix] = new_type\n",
+    "        short = attrs.get(\"short_prefix\")\n",
+    "        if short:\n",
+    "            new_type.prefix_map[short] = new_type\n",
+    "        return new_type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TagTypeEnum(Enum):\n",
+    "    unimod = 0\n",
+    "    psimod = 1\n",
+    "    massmod = 2\n",
+    "    generic = 3\n",
+    "    info = 4\n",
+    "    gnome = 5\n",
+    "    formula = 6\n",
+    "    glycan = 7\n",
+    "    xlmod = 8\n",
+    "    localization_marker = 9\n",
+    "    group_placeholder = 999\n",
+    "    \n",
+    "\n",
+    "@add_metaclass(PrefixSavingMeta)\n",
+    "class TagBase(object):\n",
+    "    __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n",
+    "\n",
+    "    prefix_name = None\n",
+    "    short_prefix = None\n",
+    "    prefix_map = {}\n",
+    "    \n",
+    "    def __init__(self, type, value, extra=None, group_id=None):\n",
+    "        self.type = type\n",
+    "        self.value = value\n",
+    "        self.extra = extra or []\n",
+    "        self.group_id = group_id\n",
+    "\n",
+    "    def __str__(self):\n",
+    "        part = self._format_main()\n",
+    "        if self.extra:\n",
+    "            rest = [str(e) for e in self.extra]\n",
+    "            label = '|'.join([part] + rest)\n",
+    "        else:\n",
+    "            label = part\n",
+    "        if self.group_id:\n",
+    "            label = '%s#%s' % (label, self.group_id)\n",
+    "        return label\n",
+    "    \n",
+    "    def __repr__(self):\n",
+    "        template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n",
+    "        return template.format(self=self)\n",
+    "\n",
+    "\n",
+    "class LocalizationMarker(TagBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    def __init__(self, value, extra=None, group_id=None):\n",
+    "        assert group_id is not None\n",
+    "        super(LocalizationMarker, self).__init__(TagTypeEnum.localization_marker, float(value), extra, group_id)\n",
+    "    \n",
+    "    def _format_main(self):\n",
+    "        return \"#{self.group_id}({self.value!f})\".format(self=self)\n",
+    "    \n",
+    "    \n",
+    "class MassModification(TagBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    def __init__(self, value, extra=None, group_id=None):\n",
+    "        super(MassModification, self).__init__(TagTypeEnum.massmod, float(value), extra, group_id)\n",
+    "    \n",
+    "    def _format_main(self):\n",
+    "        return '%0.4f' % self.value\n",
+    "\n",
+    "    \n",
+    "class ControlledVocabularyModificationBase(TagBase):\n",
+    "    _tag_type = None\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    def __init__(self, value, extra=None, group_id=None):\n",
+    "        super(ControlledVocabularyModificationBase, self).__init__(\n",
+    "            self._tag_type, value, extra, group_id)\n",
+    "\n",
+    "    def _format_main(self):\n",
+    "        return \"{self.prefix_name}:{self.value}\".format(self=self)\n",
+    "\n",
+    "    \n",
+    "class GenericModification(TagBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    def __init__(self, value, extra=None, group_id=None):\n",
+    "        super(GenericModification, self).__init__(TagTypeEnum.generic, value, extra, group_id)\n",
+    "    \n",
+    "    def _format_main(self):\n",
+    "        return self.value\n",
+    "    \n",
+    "\n",
+    "class UnimodModification(ControlledVocabularyModificationBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    prefix_name = \"UNIMOD\"\n",
+    "    short_prefix = \"U\"\n",
+    "    _tag_type = TagTypeEnum.unimod\n",
+    "\n",
+    "\n",
+    "class PSIModModification(ControlledVocabularyModificationBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    prefix_name = \"MOD\"\n",
+    "    short_prefix = 'M'\n",
+    "    _tag_type = TagTypeEnum.psimod\n",
+    "\n",
+    "\n",
+    "class GNOmeModification(ControlledVocabularyModificationBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    prefix_name = \"GNO\"\n",
+    "    short_prefix = 'G'\n",
+    "    _tag_type = TagTypeEnum.gnome\n",
+    "\n",
+    "    \n",
+    "class XLMODModification(ControlledVocabularyModificationBase):\n",
+    "    __slots__ = ()\n",
+    "    \n",
+    "    prefix_name = \"XLMOD\"\n",
+    "#     short_prefix = 'XL'\n",
+    "    _tag_type = TagTypeEnum.xlmod\n",
+    "    \n",
+    "    \n",
+    "class TagParserStateEnum(Enum):\n",
+    "    start = 0\n",
+    "    group_id = 1\n",
+    "\n",
+    "def split_tags(tokens):\n",
+    "    starts = [0]\n",
+    "    ends = []\n",
+    "    for i, c in enumerate(tokens):\n",
+    "        if c == '|':\n",
+    "            ends.append(i)\n",
+    "            starts.append(i + 1)\n",
+    "    ends.append(len(tokens))\n",
+    "    out = []\n",
+    "    for i, start in enumerate(starts):\n",
+    "        end = ends[i]\n",
+    "        out.append(tokens[start:end])\n",
+    "    return out\n",
+    "\n",
+    "def find_prefix(tokens):\n",
+    "    for i, c in enumerate(tokens):\n",
+    "        if c == ':':\n",
+    "            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n",
+    "    return None, tokens\n",
+    "    \n",
+    "def process_tag_tokens(tokens):\n",
+    "    parts = split_tags(tokens)\n",
+    "    main_tag = parts[0]\n",
+    "    if main_tag[0] in ('+', '-'):\n",
+    "        main_tag = ''.join(main_tag)\n",
+    "        main_tag = MassModification(main_tag)\n",
+    "    else:\n",
+    "        prefix, value = find_prefix(main_tag)\n",
+    "        if prefix is None:\n",
+    "            main_tag = GenericModification(''.join(value))\n",
+    "        else:\n",
+    "            tag_type = TagBase.prefix_map[prefix]\n",
+    "            main_tag = tag_type(value)\n",
+    "    if len(parts) > 1:\n",
+    "        extras = []\n",
+    "        for part in parts:\n",
+    "            prefix, value = find_prefix(part)\n",
+    "            if prefix is None:\n",
+    "                if value.startswith(\"#\"):\n",
+    "                    main_tag.group_id = value\n",
+    "                else:\n",
+    "                    main_tag.extra.append(GenericModification(''.join(value)))\n",
+    "            else:\n",
+    "                tag_type = TagBase.prefix_map[prefix]\n",
+    "                main_tag.extra.append(tag_type(value))\n",
+    "    return main_tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'QWERTYIPASDFGHKLCVNM'"
+      ]
+     },
+     "execution_count": 103,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from pyteomics import parser\n",
+    "''.join(parser.std_amino_acids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ParserStateEnum(Enum):\n",
+    "    before_sequence = 0\n",
+    "    tag_before_sequence = 1\n",
+    "    global_tag = 2\n",
+    "    fixed_spec = 3\n",
+    "    labile_tag = 4\n",
+    "    sequence = 5\n",
+    "    tag_in_sequence = 6\n",
+    "    interval_tag = 7\n",
+    "    tag_after_sequence = 8\n",
+    "    \n",
+    "    done = 999\n",
+    "\n",
+    "\n",
+    "BEFORE = ParserStateEnum.before_sequence\n",
+    "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n",
+    "FIXED = ParserStateEnum.fixed_spec\n",
+    "GLOBAL = ParserStateEnum.global_tag\n",
+    "LABILE = ParserStateEnum.labile_tag\n",
+    "SEQ = ParserStateEnum.sequence\n",
+    "TAG = ParserStateEnum.tag_in_sequence\n",
+    "INTERVAL_TAG = ParserStateEnum.interval_tag\n",
+    "TAG_AFTER = ParserStateEnum.tag_after_sequence\n",
+    "DONE = ParserStateEnum.done\n",
+    "\n",
+    "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n",
+    "\n",
+    "def tokenize_proforma(sequence):\n",
+    "    labile_modifications = []\n",
+    "    fixed_modifications = []\n",
+    "    unlocalized_modifications = []\n",
+    "    intervals = []\n",
+    "    isotopes = []\n",
+    "    \n",
+    "    n_term = None\n",
+    "    c_term = None\n",
+    "    \n",
+    "    i = 0\n",
+    "    n = len(sequence)\n",
+    "    \n",
+    "    positions = []\n",
+    "    state = BEFORE\n",
+    "    depth = 0\n",
+    "    \n",
+    "    current_aa = None\n",
+    "    current_tag = []\n",
+    "    current_interval = None\n",
+    "    \n",
+    "    while i < n:\n",
+    "        c = sequence[i]\n",
+    "        i += 1\n",
+    "        if state == BEFORE:\n",
+    "            if c == '[':\n",
+    "                state = TAG_BEFORE\n",
+    "                depth = 1\n",
+    "            elif c == '{':\n",
+    "                state = LABILE\n",
+    "                depth = 1\n",
+    "            elif c == '<':\n",
+    "                state = FIXED\n",
+    "            elif c in VALID_AA:\n",
+    "                current_aa = c\n",
+    "                state = SEQ\n",
+    "            elif c == '?':\n",
+    "                if current_tag:\n",
+    "                    unlocalized_modifications.append(process_tag_tokens(current_tag))\n",
+    "                    current_tag = []\n",
+    "                else:\n",
+    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "            elif c == '-':\n",
+    "                if current_tag:\n",
+    "                    n_term = process_tag_tokens(current_tag)\n",
+    "                    current_tag = []\n",
+    "                else:\n",
+    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "            else:\n",
+    "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "        elif state == SEQ:\n",
+    "            if c in VALID_AA:\n",
+    "                positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n",
+    "                current_aa = c\n",
+    "                current_tag = []\n",
+    "            elif c == '[':\n",
+    "                state = TAG\n",
+    "                depth = 1\n",
+    "            elif c == '(':\n",
+    "                current_interval = [len(positions), None, None]\n",
+    "            elif c == ')':\n",
+    "                if current_interval is None:\n",
+    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "                else:\n",
+    "                    current_interval[1] = len(positions)\n",
+    "                    if i >= n or sequence[i] != '[':\n",
+    "                        raise Exception(\"Missing Interval Tag\")\n",
+    "                    i += 1\n",
+    "                    depth = 1\n",
+    "                    state = INTERVAL_TAG\n",
+    "            elif c == '-':\n",
+    "                state = TAG_AFTER\n",
+    "                if i >= n or sequence[i] != '[':\n",
+    "                    raise Exception(\"Missing Interval Tag\")\n",
+    "                i += 1\n",
+    "                depth = 1                \n",
+    "            else:\n",
+    "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER:\n",
+    "            if c == '[':\n",
+    "                depth += 1\n",
+    "            elif c == ']':\n",
+    "                depth -= 1\n",
+    "                if depth <= 0:\n",
+    "                    depth = 0\n",
+    "                    if state == TAG: \n",
+    "                        state = SEQ\n",
+    "                    elif state == TAG_BEFORE:\n",
+    "                        state = BEFORE\n",
+    "                    elif state == TAG_AFTER:\n",
+    "                        c_term = process_tag_tokens(current_tag)\n",
+    "                        state = DONE\n",
+    "            else:\n",
+    "                current_tag.append(c)\n",
+    "        elif state == LABILE:\n",
+    "            if c == '{':\n",
+    "                depth += 1\n",
+    "            elif c == '}':\n",
+    "                depth -= 1\n",
+    "                if depth <= 0:\n",
+    "                    depth = 0\n",
+    "                    labile_modifications.append(process_tag_tokens(current_tag))\n",
+    "                    current_tag = []\n",
+    "                    state = BEFORE\n",
+    "            else:\n",
+    "                current_tag.append(c)\n",
+    "        elif state == INTERVAL_TAG:\n",
+    "            if c == '[':\n",
+    "                depth += 1\n",
+    "            elif c == ']':\n",
+    "                depth -= 1\n",
+    "                if depth <= 0:\n",
+    "                    depth = 0\n",
+    "                    current_interval[2] = process_tag_tokens(current_tag)\n",
+    "                    current_tag = []\n",
+    "                    intervals.append(current_interval)\n",
+    "                    current_interval = None\n",
+    "                    state = SEQ\n",
+    "            else:\n",
+    "                current_tag.append(c)\n",
+    "        else:\n",
+    "            raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "    if current_aa:\n",
+    "        positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n",
+    "    return positions, {\n",
+    "        'n_term': n_term,\n",
+    "        'c_term': c_term,\n",
+    "        'unlocalized_modifications': unlocalized_modifications,\n",
+    "        'labile_modifications': labile_modifications,\n",
+    "        'intervals': intervals,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "([('S', None),\n",
+       "  ('T', UnimodModification('Ox', [], None)),\n",
+       "  ('E', None),\n",
+       "  ('P', None),\n",
+       "  ('P', None),\n",
+       "  ('I', None),\n",
+       "  ('N', None),\n",
+       "  ('G', None)],\n",
+       " {'n_term': GenericModification('Hex', [], None),\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [],\n",
+       "  'labile_modifications': [GenericModification('Foo', [], None)],\n",
+       "  'intervals': [[1, 4, MassModification(18.0, [], None)]]})"
+      ]
+     },
+     "execution_count": 110,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "seq, fields = tokenize_proforma(\"{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n",
+    "seq, fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From c79c3e89212e5fab14401574c9aabc1ab66d10f8 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sat, 19 Dec 2020 23:00:26 -0500
Subject: [PATCH 02/27] A draft parser for ProForma without any semantics on
 the returned object

---
 pyteomics/proforma.py | 815 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 815 insertions(+)
 create mode 100644 pyteomics/proforma.py

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
new file mode 100644
index 00000000..26ea3355
--- /dev/null
+++ b/pyteomics/proforma.py
@@ -0,0 +1,815 @@
+'''A simple ProForma lexer
+
+The primary interface is through :func:`parse_proforma`:
+
+    >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK")
+        ([('E', None),
+          ('M', GenericModification('Oxidation', None, None)),
+          ('E', None),
+          ('V', None),
+          ('T', LocalizationMarker(0.01, None, '#g1')),
+          ('S', LocalizationMarker(0.09, None, '#g1')),
+          ('E', None),
+          ('S',
+          GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),
+          ('P', None),
+          ('E', None),
+          ('K', None)],
+         {'n_term': None,
+          'c_term': None,
+          'unlocalized_modifications': [],
+          'labile_modifications': [],
+          'fixed_modifications': [],
+          'intervals': [],
+          'isotopes': [],
+          'group_ids': ['#g1']})
+
+'''
+
+import re
+from collections import namedtuple, defaultdict
+
+try:
+    from enum import Enum
+except ImportError:
+    # Python 2 doesn't have a builtin Enum type
+    Enum = object
+
+from six import add_metaclass
+
+from pyteomics import parser
+from pyteomics.mass import Composition
+
+
+class PrefixSavingMeta(type):
+    '''A subclass-registering-metaclass that provides easy
+    lookup of subclasses by prefix attributes.
+    '''
+
+    def __new__(mcs, name, parents, attrs):
+        new_type = type.__new__(mcs, name, parents, attrs)
+        prefix = attrs.get("prefix_name")
+        if prefix:
+            new_type.prefix_map[prefix.lower()] = new_type
+        short = attrs.get("short_prefix")
+        if short:
+            new_type.prefix_map[short.lower()] = new_type
+        return new_type
+
+    def find_by_tag(self, tag_name):
+        if tag_name is None:
+            raise ValueError("tag_name cannot be None!")
+        tag_name = tag_name.lower()
+        return self.prefix_map[tag_name]
+
+
+class TagTypeEnum(Enum):
+    unimod = 0
+    psimod = 1
+    massmod = 2
+    generic = 3
+    info = 4
+    gnome = 5
+    xlmod = 6
+
+    formula = 7
+    glycan = 8
+
+    localization_marker = 9
+    position_label = 10
+    group_placeholder = 999
+
+
+_sentinel = object()
+
+
+@add_metaclass(PrefixSavingMeta)
+class TagBase(object):
+    '''A base class for all tag types.
+
+    Attributes
+    ----------
+    type: Enum
+        An element of :class:`TagTypeEnum` saying what kind of tag this is.
+    value: object
+        The data stored in this tag, usually an externally controlled name
+    extra: list
+        Any extra tags that were nested within this tag. Usually limited to INFO
+        tags but may be other synonymous controlled vocabulary terms.
+    group_id: str or None
+        A short label denoting which group, if any, this tag belongs to
+    '''
+    __slots__ = ("type", "value", "extra", "group_id")
+
+    prefix_name = None
+    short_prefix = None
+    prefix_map = {}
+
+    def __init__(self, type, value, extra=None, group_id=None):
+        self.type = type
+        self.value = value
+        self.extra = extra
+        self.group_id = group_id
+
+    def __str__(self):
+        part = self._format_main()
+        if self.extra:
+            rest = [str(e) for e in self.extra]
+            label = '|'.join([part] + rest)
+        else:
+            label = part
+        if self.group_id:
+            label = '%s%s' % (label, self.group_id)
+        return '%s' % label
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})"
+        return template.format(self=self)
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \
+            and (self.group_id == other.group_id)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def find_extra(self, label):
+        out = []
+        if not self.extra:
+            return out
+        for e in self.extra:
+            if e.type == label:
+                out.append(e)
+        return out
+
+
+class PositionLabelTag(TagBase):
+    '''A tag to mark that a position is involved in a group in some way, but does
+    not imply any specific semantics. The group ID is stored as both value and group_id.
+    '''
+    __slots__ = ()
+
+    def __init__(self, value=None, extra=None, group_id=None):
+        assert group_id is not None
+        super(PositionLabelTag, self).__init__(
+            TagTypeEnum.position_label, group_id, extra, group_id)
+
+    def _format_main(self):
+        return "{self.group_id}".format(self=self)  # group_id already starts with "#"
+
+
+class LocalizationMarker(TagBase):
+    '''A tag to mark a particular localization site; ``value`` is stored as a float
+    '''
+    __slots__ = ()
+
+    def __init__(self, value, extra=None, group_id=None):
+        assert group_id is not None
+        super(LocalizationMarker, self).__init__(
+            TagTypeEnum.localization_marker, float(value), extra, group_id)
+
+    def _format_main(self):
+        return "{self.group_id}({self.value:f})".format(self=self)  # group_id already starts with "#"
+
+
+class InformationTag(TagBase):
+    '''A tag carrying free text describing the location
+    '''
+    __slots__ = ()
+
+    prefix_name = "INFO"
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(InformationTag, self).__init__(
+            TagTypeEnum.info, str(value), extra, group_id)
+
+    def _format_main(self):
+        return str(self.value)
+
+
+class MassModification(TagBase):
+    '''A modification defined purely by a signed mass shift in Daltons.
+
+    The value of a :class:`MassModification` is always a :class:`float`
+    '''
+    __slots__ = ()
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(MassModification, self).__init__(
+            TagTypeEnum.massmod, float(value), extra, group_id)
+
+    def _format_main(self):
+        return '%0.4f' % self.value
+
+
+class ModificationBase(TagBase):
+    '''A base class for all modification tags with marked prefixes.
+    '''
+
+    _tag_type = None
+    __slots__ = ()
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(ModificationBase, self).__init__(
+            self._tag_type, value, extra, group_id)
+
+    def _format_main(self):
+        return "{self.prefix_name}:{self.value}".format(self=self)
+
+    def resolve(self):
+        '''Find the term and return its properties
+        '''
+        raise NotImplementedError()
+
+
+class FormulaModification(ModificationBase):
+    prefix_name = "Formula"
+
+    _tag_type = TagTypeEnum.formula
+
+    def resolve(self):
+        # The handling of fixed isotopes is wrong here as Pyteomics uses a different
+        # convention.
+        from pyteomics.mass import Composition
+        composition = Composition(formula=''.join(self.value.split(" ")))
+        return {
+            "mass": composition.mass(),
+            "composition": composition
+        }
+
+
+class GlycanModification(ModificationBase):
+    prefix_name = "Glycan"
+
+    _tag_type = TagTypeEnum.glycan
+
+
+class GenericModification(TagBase):
+    __slots__ = ()
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(GenericModification, self).__init__(
+            TagTypeEnum.generic, value, extra, group_id)
+
+    def _format_main(self):
+        return self.value
+
+    def resolve(self):
+        '''Find the term, searching through all available vocabularies and
+        return the first match's properties
+        '''
+        raise NotImplementedError()
+
+
+class UnimodModification(ModificationBase):
+    __slots__ = ()
+
+    prefix_name = "UNIMOD"
+    short_prefix = "U"
+    _tag_type = TagTypeEnum.unimod
+
+
+class PSIModModification(ModificationBase):
+    __slots__ = ()
+
+    prefix_name = "MOD"
+    short_prefix = 'M'
+    _tag_type = TagTypeEnum.psimod
+
+
+class GNOmeModification(ModificationBase):
+    __slots__ = ()
+
+    prefix_name = "GNO"
+    # short_prefix = 'G'
+    _tag_type = TagTypeEnum.gnome
+
+
+class XLMODModification(ModificationBase):
+    __slots__ = ()
+
+    prefix_name = "XLMOD"
+    # short_prefix = 'XL'
+    _tag_type = TagTypeEnum.xlmod
+
+
+def split_tags(tokens):
+    '''Split a token array into discrete sets of tag
+    tokens.
+
+    Parameters
+    ----------
+    tokens: list
+        The characters of the tag token buffer
+
+    Returns
+    -------
+    list of list:
+        The tokens for each contained tag
+    '''
+    starts = [0]
+    ends = []
+    for i, c in enumerate(tokens):
+        if c == '|':
+            ends.append(i)
+            starts.append(i + 1)
+        elif (i != 0 and c == '#'):
+            ends.append(i)
+            starts.append(i)
+    ends.append(len(tokens))
+    out = []
+    for i, start in enumerate(starts):
+        end = ends[i]
+        out.append(tokens[start:end])
+    return out
+
+
+def find_prefix(tokens):
+    '''Find the prefix, if any, of the tag defined by `tokens`,
+    delimited by the first ":".
+
+    Parameters
+    ----------
+    tokens: list
+        The tag tokens to search
+
+    Returns
+    -------
+    prefix: str or None
+        The prefix string, if found
+    rest: str
+        The rest of the tokens, merged as a string
+    '''
+    for i, c in enumerate(tokens):
+        if c == ':':
+            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])
+    return None, ''.join(tokens)
+
+
+def process_marker(tokens):
+    '''Process a marker, which is a tag whose value starts with #.
+
+    Parameters
+    ----------
+    tokens: list or str
+        The tag tokens to parse, either a list of characters or a merged string
+
+    Returns
+    -------
+    PositionLabelTag or LocalizationMarker
+    '''
+    if ''.join(tokens[1:3]) == 'XL':  # join first: a list slice never equals the str 'XL'
+        return PositionLabelTag(None, group_id=''.join(tokens))
+    else:
+        group_id = None
+        value = None
+        for i, c in enumerate(tokens):
+            if c == '(':
+                group_id = ''.join(tokens[:i])
+                if tokens[-1] != ')':
+                    raise Exception(
+                        "Localization marker with score missing closing parenthesis")
+                value = float(''.join(tokens[i + 1:-1]))
+                return LocalizationMarker(value, group_id=group_id)
+        else:
+            group_id = ''.join(tokens)
+            return PositionLabelTag(group_id=group_id)
+
+
+def process_tag_tokens(tokens):
+    '''Convert a tag token buffer into a parsed :class:`TagBase` instance
+    of the appropriate sub-type with zero or more sub-tags.
+
+    Parameters
+    ----------
+    tokens: list
+        The tokens to parse
+
+    Returns
+    -------
+    TagBase:
+        The parsed tag
+    '''
+    parts = split_tags(tokens)
+    main_tag = parts[0]
+    if main_tag[0] in ('+', '-'):
+        main_tag = ''.join(main_tag)
+        main_tag = MassModification(main_tag)
+    elif main_tag[0] == '#':
+        main_tag = process_marker(main_tag)
+    else:
+        prefix, value = find_prefix(main_tag)
+        if prefix is None:
+            main_tag = GenericModification(''.join(value))
+        else:
+            tag_type = TagBase.find_by_tag(prefix)
+            main_tag = tag_type(value)
+    if len(parts) > 1:
+        extras = []
+        for part in parts[1:]:
+            prefix, value = find_prefix(part)
+            if prefix is None:
+                if value[0] == "#":
+                    marker = process_marker(value)
+                    if isinstance(marker, PositionLabelTag):
+                        main_tag.group_id = ''.join(value)
+                    else:
+                        main_tag.group_id = marker.group_id
+                        extras.append(marker)
+                else:
+                    extras.append(GenericModification(''.join(value)))
+            else:
+                tag_type = TagBase.find_by_tag(prefix)
+                extras.append(tag_type(value))
+        main_tag.extra = extras
+    return main_tag
+
+
+class ModificationRule(object):
+    '''Define a fixed modification rule which dictates a modification tag is
+    always applied at one or more amino acid residues.
+
+    Attributes
+    ----------
+    modification_tag: TagBase
+        The modification to apply
+    targets: list
+        The list of amino acids this applies to
+    '''
+    __slots__ = ('modification_tag', 'targets')
+
+    def __init__(self, modification_tag, targets=None):
+        self.modification_tag = modification_tag
+        self.targets = targets
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return self.modification_tag == other.modification_tag and self.targets == other.targets
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        targets = ','.join(self.targets)
+        return "<{self.modification_tag}@{targets}>".format(self=self, targets=targets)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self)
+
+
+class StableIsotope(object):
+    '''Define a fixed isotope that is applied globally to all amino acids.
+
+    Attributes
+    ----------
+    isotope: str
+        The stable isotope string, of the form [<isotope-number>]<element> or a special
+        isotopoform's name.
+    '''
+    __slots__ = ('isotope', )
+
+    def __init__(self, isotope):
+        self.isotope = isotope
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return self.isotope == other.isotope
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        return "<{self.isotope}>".format(self=self)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.isotope})".format(self=self)
+
+
+class TaggedInterval(object):
+    '''Define a fixed interval over the associated sequence which contains the localization
+    of the associated tag.
+
+    Attributes
+    ----------
+    start: int
+        The starting position (inclusive) of the interval along the primary sequence
+    end: int
+        The ending position (exclusive) of the interval along the primary sequence
+    tag: TagBase
+        The tag being localized
+    '''
+    __slots__ = ('start', 'end', 'tag')
+
+    def __init__(self, start, end=None, tag=None):
+        self.start = start
+        self.end = end
+        self.tag = tag
+
+    def __eq__(self, other):
+        '''Compare two intervals by start, end and tag.
+
+        ``None`` never compares equal. For any other object that lacks one of
+        those attributes, ``NotImplemented`` is returned so that ``==`` evaluates
+        to ``False`` instead of raising :class:`AttributeError`.
+        '''
+        if other is None:
+            return False
+        try:
+            # Only the attribute lookup is guarded, so errors raised while
+            # comparing the values themselves still propagate.
+            other_start, other_end, other_tag = other.start, other.end, other.tag
+        except AttributeError:
+            return NotImplemented
+        return self.start == other_start and self.end == other_end and self.tag == other_tag
+
+    def __ne__(self, other):
+        # Python 2 does not derive ``!=`` from ``__eq__``.
+        return not self == other
+
+    def __str__(self):
+        return "({self.start}-{self.end}){self.tag!r}".format(self=self)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self)
+
+
+class TagParser(object):
+    '''Collect tag tokens and turn them into :class:`TagBase` instances on demand.
+
+    Supports iteration, indexing, ``len()`` and truth testing over the pending tokens.
+
+    Attributes
+    ----------
+    buffer: list
+        The tokens gathered since the buffer was last cleared.
+    group_ids: set
+        Every group ID found on a tag returned by :meth:`process`.
+    '''
+
+    def __init__(self, initial=None, group_ids=None):
+        self.buffer = list(initial) if initial else []
+        self.group_ids = set(group_ids) if group_ids else set()
+
+    def append(self, c):
+        '''Store one more token at the end of the buffer.
+
+        Parameters
+        ----------
+        c: str
+            The token to store
+        '''
+        self.buffer.append(c)
+
+    def reset(self):
+        '''Drop all pending tokens by starting over with a fresh, empty buffer.
+        '''
+        self.buffer = []
+
+    def __bool__(self):
+        return True if self.buffer else False
+
+    def __iter__(self):
+        pending = self.buffer
+        return iter(pending)
+
+    def __getitem__(self, i):
+        pending = self.buffer
+        return pending[i]
+
+    def __len__(self):
+        pending = self.buffer
+        return len(pending)
+
+    def process(self):
+        '''Convert the pending tokens into a tag, remember its group ID (if any),
+        empty the buffer and hand the tag back.
+
+        Returns
+        -------
+        TagBase
+        '''
+        parsed = process_tag_tokens(self.buffer)
+        group = parsed.group_id
+        if group:
+            self.group_ids.add(group)
+        self.reset()
+        return parsed
+
+
+class ParserStateEnum(Enum):
+    '''The states of the :func:`parse_proforma` state machine.'''
+    # Initial state; also re-entered after a leading tag, labile group, fixed
+    # modification rule or stable isotope has been closed.
+    before_sequence = 0
+    # Inside a ``[...]`` tag opened before the first amino acid.
+    tag_before_sequence = 1
+    # Inside the ``[...]`` tag of a ``<[...]@...>`` fixed modification rule.
+    global_tag = 2
+    # Just consumed ``<``; the next character selects global_tag or stable_isotope.
+    fixed_spec = 3
+    # Inside a ``{...}`` labile modification.
+    labile_tag = 4
+    # Reading the amino acids of the primary sequence.
+    sequence = 5
+    # Inside a ``[...]`` tag opened while reading the sequence.
+    tag_in_sequence = 6
+    # Inside the ``[...]`` tag that must follow the closing ``)`` of an interval.
+    interval_tag = 7
+    # Inside the ``[...]`` tag that must follow the ``-`` after the sequence.
+    tag_after_sequence = 8
+    # Inside a ``<...>`` group whose first character is not ``[``.
+    stable_isotope = 9
+
+    # The tag after the sequence has been closed; any further character is an error.
+    done = 999
+
+
+# Short module-level aliases for the parser states, as used by parse_proforma.
+BEFORE = ParserStateEnum.before_sequence
+TAG_BEFORE = ParserStateEnum.tag_before_sequence
+FIXED = ParserStateEnum.fixed_spec
+GLOBAL = ParserStateEnum.global_tag
+ISOTOPE = ParserStateEnum.stable_isotope
+LABILE = ParserStateEnum.labile_tag
+SEQ = ParserStateEnum.sequence
+TAG = ParserStateEnum.tag_in_sequence
+INTERVAL_TAG = ParserStateEnum.interval_tag
+TAG_AFTER = ParserStateEnum.tag_after_sequence
+DONE = ParserStateEnum.done
+
+# One-letter amino acid codes that parse_proforma accepts in the primary
+# sequence and as targets of fixed modification rules.
+VALID_AA = set("QWERTYIPASDFGHKLCVNM")
+
+def parse_proforma(sequence):
+    '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a
+    mapping of sequence-spanning modifiers.
+
+    .. note::
+        This is a state machine parser, but with certain sub-state paths
+        unrolled to avoid an explosion of formal intermediary states.
+
+    Parameters
+    ----------
+    sequence: str
+        The sequence to parse
+
+    Returns
+    -------
+    parsed_sequence: list
+        The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence
+    modifiers: dict
+        A mapping listing the N- and C-terminal tags, labile modifications, fixed modifications,
+        stable isotopes, unlocalized modifications, tagged intervals, and group IDs
+
+    Raises
+    ------
+    Exception
+        If a character is not valid in the current parser state, or if the string ends
+        while a tag, fixed modification rule, stable isotope or interval is still open.
+    '''
+    labile_modifications = []
+    fixed_modifications = []
+    unlocalized_modifications = []
+    intervals = []
+    isotopes = []
+
+    n_term = None
+    c_term = None
+
+    i = 0
+    n = len(sequence)
+
+    positions = []
+    state = BEFORE
+    depth = 0
+
+    current_aa = None
+    current_tag = TagParser()
+    current_interval = None
+
+    while i < n:
+        # ``i`` always points one past ``c``, so ``sequence[i]`` is the look-ahead character.
+        c = sequence[i]
+        i += 1
+        if state == BEFORE:
+            if c == '[':
+                state = TAG_BEFORE
+                depth = 1
+            elif c == '{':
+                state = LABILE
+                depth = 1
+            elif c == '<':
+                state = FIXED
+            elif c in VALID_AA:
+                current_aa = c
+                state = SEQ
+            else:
+                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+        elif state == SEQ:
+            if c in VALID_AA:
+                # Commit the pending residue together with the tag collected for it, if any.
+                positions.append((current_aa, current_tag.process() if current_tag else None))
+                current_aa = c
+            elif c == '[':
+                state = TAG
+                depth = 1
+            elif c == '(':
+                # The pending residue is not in ``positions`` yet, so the next residue
+                # will land at index ``len(positions) + 1``.
+                current_interval = TaggedInterval(len(positions) + 1)
+            elif c == ')':
+                if current_interval is None:
+                    raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+                else:
+                    current_interval.end = len(positions) + 1
+                    if i >= n or sequence[i] != '[':
+                        raise Exception("Missing Interval Tag")
+                    i += 1
+                    depth = 1
+                    state = INTERVAL_TAG
+            elif c == '-':
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise Exception("Missing Closing Tag")
+                i += 1
+                depth = 1
+            else:
+                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:
+            if c == '[':
+                depth += 1
+                # A nested bracket is part of the tag's content, keep it.
+                current_tag.append(c)
+            elif c == ']':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    if state == TAG:
+                        state = SEQ
+                    elif state == TAG_BEFORE:
+                        if i < n:
+                            cnext = sequence[i]
+                            if cnext == '?':
+                                unlocalized_modifications.append(current_tag.process())
+                                i += 1
+                            elif cnext == '-':
+                                n_term = current_tag.process()
+                                i += 1
+                            else:
+                                i += 1
+                                raise Exception("Error In State {state}, unexpected {cnext} found at index {i}".format(**locals()))
+
+                        state = BEFORE
+                    elif state == TAG_AFTER:
+                        c_term = current_tag.process()
+                        state = DONE
+                    elif state == GLOBAL:
+                        # Gobble the rest of the global tag inline to avoid spawning
+                        # a whole new state.
+                        if i >= n:
+                            raise Exception(
+                                ("Error In State {state}, unclosed fixed modification rule").format(**locals()))
+                        c = sequence[i]
+                        i += 1
+                        if c != '@':
+                            raise Exception(
+                                ("Error In State {state}, fixed modification detected without "
+                                "target amino acids found at index {i}").format(**locals()))
+                        targets = []
+                        while i < n:
+                            c = sequence[i]
+                            i += 1
+                            if c in VALID_AA:
+                                targets.append(c)
+                            elif c == ',':
+                                pass
+                            elif c == '>':
+                                break
+                            else:
+                                raise Exception(
+                                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+                        else:
+                            # The string ran out before the closing ``>`` was seen.
+                            raise Exception(
+                                ("Error In State {state}, unclosed fixed modification rule").format(**locals()))
+
+                        fixed_modifications.append(
+                            ModificationRule(current_tag.process(), targets))
+                        state = BEFORE
+                else:
+                    # Closing a nested bracket, which is still tag content.
+                    current_tag.append(c)
+            else:
+                current_tag.append(c)
+        elif state == FIXED:
+            if c == '[':
+                state = GLOBAL
+                # Track nesting the same way the other bracketed tags do.
+                depth = 1
+            else:
+                state = ISOTOPE
+                current_tag.append(c)
+        elif state == ISOTOPE:
+            if c != '>':
+                current_tag.append(c)
+            else:
+                isotopes.append(StableIsotope(''.join(current_tag)))
+                current_tag.reset()
+                state = BEFORE
+        elif state == LABILE:
+            if c == '{':
+                depth += 1
+                current_tag.append(c)
+            elif c == '}':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    labile_modifications.append(current_tag.process())
+                    state = BEFORE
+                else:
+                    current_tag.append(c)
+            else:
+                current_tag.append(c)
+        elif state == INTERVAL_TAG:
+            if c == '[':
+                depth += 1
+                current_tag.append(c)
+            elif c == ']':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    current_interval.tag = current_tag.process()
+                    intervals.append(current_interval)
+                    current_interval = None
+                    state = SEQ
+                else:
+                    current_tag.append(c)
+            else:
+                current_tag.append(c)
+        else:
+            raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+    # Every state that is only left by a closing delimiter must not be active at the end.
+    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, FIXED, GLOBAL, INTERVAL_TAG):
+        raise Exception("Error In State {state}, unclosed group reached end of string!".format(**locals()))
+    if current_interval is not None:
+        raise Exception("Error In State {state}, unclosed interval reached end of string!".format(**locals()))
+    if current_aa:
+        positions.append((current_aa, current_tag.process() if current_tag else None))
+    return positions, {
+        'n_term': n_term,
+        'c_term': c_term,
+        'unlocalized_modifications': unlocalized_modifications,
+        'labile_modifications': labile_modifications,
+        'fixed_modifications': fixed_modifications,
+        'intervals': intervals,
+        'isotopes': isotopes,
+        'group_ids': list(current_tag.group_ids)
+    }

From 8c5301edb80a3366f57fc8e166c241c4b3b37b86 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sat, 19 Dec 2020 23:01:00 -0500
Subject: [PATCH 03/27] Updates to the notebook

---
 proforma_parsing.ipynb | 767 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 676 insertions(+), 91 deletions(-)

diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb
index f87866a1..6ca08254 100644
--- a/proforma_parsing.ipynb
+++ b/proforma_parsing.ipynb
@@ -2,38 +2,55 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 172,
    "metadata": {},
    "outputs": [],
    "source": [
     "import re\n",
     "from collections import namedtuple, defaultdict\n",
-    "from enum import Enum\n",
     "\n",
-    "from six import add_metaclass"
+    "try:\n",
+    "    from enum import Enum\n",
+    "except ImportError:\n",
+    "    # Python 2 doesn't have a builtin Enum type\n",
+    "    Enum = object\n",
+    "\n",
+    "from six import add_metaclass\n",
+    "\n",
+    "from pyteomics import parser"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 173,
    "metadata": {},
    "outputs": [],
    "source": [
     "class PrefixSavingMeta(type):\n",
+    "    '''A subclass-registering-metaclass that provides easy\n",
+    "    lookup of subclasses by prefix attributes.\n",
+    "    '''\n",
+    "\n",
     "    def __new__(mcs, name, parents, attrs):\n",
     "        new_type = type.__new__(mcs, name, parents, attrs)\n",
     "        prefix = attrs.get(\"prefix_name\")\n",
     "        if prefix:\n",
-    "            new_type.prefix_map[prefix] = new_type\n",
+    "            new_type.prefix_map[prefix.lower()] = new_type\n",
     "        short = attrs.get(\"short_prefix\")\n",
     "        if short:\n",
-    "            new_type.prefix_map[short] = new_type\n",
-    "        return new_type"
+    "            new_type.prefix_map[short.lower()] = new_type\n",
+    "        return new_type\n",
+    "    \n",
+    "    def find_by_tag(self, tag_name):\n",
+    "        if tag_name is None:\n",
+    "            raise ValueError(\"tag_name cannot be None!\")\n",
+    "        tag_name = tag_name.lower()\n",
+    "        return self.prefix_map[tag_name]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 227,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -44,15 +61,35 @@
     "    generic = 3\n",
     "    info = 4\n",
     "    gnome = 5\n",
-    "    formula = 6\n",
-    "    glycan = 7\n",
-    "    xlmod = 8\n",
+    "    xlmod = 6\n",
+    "\n",
+    "    formula = 7\n",
+    "    glycan = 8\n",
+    "\n",
     "    localization_marker = 9\n",
+    "    position_label = 10\n",
     "    group_placeholder = 999\n",
-    "    \n",
+    "\n",
+    "\n",
+    "_sentinel = object()\n",
+    "\n",
     "\n",
     "@add_metaclass(PrefixSavingMeta)\n",
     "class TagBase(object):\n",
+    "    '''A base class for all tag types.\n",
+    "\n",
+    "    Attributes\n",
+    "    ----------\n",
+    "    type: Enum\n",
+    "        An element of :class:`TagTypeEnum` saying what kind of tag this is.\n",
+    "    value: object\n",
+    "        The data stored in this tag, usually an externally controlled name\n",
+    "    extra: list\n",
+    "        Any extra tags that were nested within this tag. Usually limited to INFO\n",
+    "        tags but may be other synonymous controlled vocabulary terms.\n",
+    "    group_id: str or None\n",
+    "        A short label denoting which group, if any, this tag belongs to\n",
+    "    '''\n",
     "    __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n",
     "\n",
     "    prefix_name = None\n",
@@ -62,7 +99,7 @@
     "    def __init__(self, type, value, extra=None, group_id=None):\n",
     "        self.type = type\n",
     "        self.value = value\n",
-    "        self.extra = extra or []\n",
+    "        self.extra = extra\n",
     "        self.group_id = group_id\n",
     "\n",
     "    def __str__(self):\n",
@@ -73,15 +110,49 @@
     "        else:\n",
     "            label = part\n",
     "        if self.group_id:\n",
-    "            label = '%s#%s' % (label, self.group_id)\n",
-    "        return label\n",
+    "            label = '%s%s' % (label, self.group_id)\n",
+    "        return '%s' % label\n",
     "    \n",
     "    def __repr__(self):\n",
     "        template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n",
     "        return template.format(self=self)\n",
+    "    \n",
+    "    def __eq__(self, other):\n",
+    "        if other is None:\n",
+    "            return False\n",
+    "        return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \\\n",
+    "            and (self.group_id == other.group_id)\n",
+    "\n",
+    "    def __ne__(self, other):\n",
+    "        return not self == other\n",
+    "\n",
+    "    def find_extra(self, label):\n",
+    "        out = []\n",
+    "        if not self.extra:\n",
+    "            return out\n",
+    "        for e in self.extra:\n",
+    "            if e.type == label:\n",
+    "                out.append(e)\n",
+    "        return out\n",
+    "\n",
+    "\n",
+    "class PositionLabelTag(TagBase):\n",
+    "    '''A tag to mark that a position is involved in a group in some way, but does\n",
+    "    not imply any specific semantics.\n",
+    "    '''\n",
+    "    __slots__ = ()\n",
+    "\n",
+    "    def __init__(self, value=None, extra=None, group_id=None):\n",
+    "        assert group_id is not None\n",
+    "        super(PositionLabelTag, self).__init__(TagTypeEnum.position_label, group_id, extra, group_id)\n",
+    "    \n",
+    "    def _format_main(self):\n",
+    "        return \"#{self.group_id}\".format(self=self)\n",
     "\n",
     "\n",
     "class LocalizationMarker(TagBase):\n",
+    "    '''A tag to mark a particular localization site \n",
+    "    '''\n",
     "    __slots__ = ()\n",
     "    \n",
     "    def __init__(self, value, extra=None, group_id=None):\n",
@@ -90,9 +161,27 @@
     "    \n",
     "    def _format_main(self):\n",
     "        return \"#{self.group_id}({self.value!f})\".format(self=self)\n",
-    "    \n",
-    "    \n",
+    "\n",
+    "\n",
+    "class InformationTag(TagBase):\n",
+    "    '''A tag carrying free text describing the location\n",
+    "    '''\n",
+    "    __slots__ = ()\n",
+    "\n",
+    "    prefix_name = \"INFO\"\n",
+    "\n",
+    "    def __init__(self, value, extra=None, group_id=None):\n",
+    "        super(InformationTag, self).__init__(TagTypeEnum.info, str(value), extra, group_id)\n",
+    "\n",
+    "    def _format_main(self):\n",
+    "        return str(self.value)\n",
+    "\n",
+    "\n",
     "class MassModification(TagBase):\n",
+    "    '''A modification defined purely by a signed mass shift in Daltons.\n",
+    "\n",
+    "    The value of a :class:`MassModification` is always a :class:`float`\n",
+    "    '''\n",
     "    __slots__ = ()\n",
     "    \n",
     "    def __init__(self, value, extra=None, group_id=None):\n",
@@ -101,17 +190,48 @@
     "    def _format_main(self):\n",
     "        return '%0.4f' % self.value\n",
     "\n",
+    "\n",
     "    \n",
-    "class ControlledVocabularyModificationBase(TagBase):\n",
+    "class ModificationBase(TagBase):\n",
+    "    '''A base class for all modification tags with marked prefixes.\n",
+    "    '''\n",
+    "\n",
     "    _tag_type = None\n",
     "    __slots__ = ()\n",
     "    \n",
     "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        super(ControlledVocabularyModificationBase, self).__init__(\n",
+    "        super(ModificationBase, self).__init__(\n",
     "            self._tag_type, value, extra, group_id)\n",
     "\n",
     "    def _format_main(self):\n",
     "        return \"{self.prefix_name}:{self.value}\".format(self=self)\n",
+    "    \n",
+    "    def resolve(self):\n",
+    "        '''Find the term and return its properties\n",
+    "        '''\n",
+    "        raise NotImplementedError()\n",
+    "\n",
+    "\n",
+    "class FormulaModification(ModificationBase):\n",
+    "    prefix_name = \"Formula\"\n",
+    "\n",
+    "    _tag_type = TagTypeEnum.formula\n",
+    "\n",
+    "    def resolve(self):\n",
+    "        # The handling of fixed isotopes is wrong here as Pyteomics uses a different\n",
+    "        # convention.\n",
+    "        from pyteomics.mass import Composition\n",
+    "        composition = Composition(formula=''.join(self.value.split(\" \")))\n",
+    "        return {\n",
+    "            \"mass\": composition.mass(),\n",
+    "            \"composition\": composition\n",
+    "        }\n",
+    "\n",
+    "\n",
+    "class GlycanModification(ModificationBase):\n",
+    "    prefix_name = \"Glycan\"\n",
+    "\n",
+    "    _tag_type = TagTypeEnum.glycan\n",
     "\n",
     "    \n",
     "class GenericModification(TagBase):\n",
@@ -122,9 +242,15 @@
     "    \n",
     "    def _format_main(self):\n",
     "        return self.value\n",
-    "    \n",
     "\n",
-    "class UnimodModification(ControlledVocabularyModificationBase):\n",
+    "    def resolve(self):\n",
+    "        '''Find the term, searching through all available vocabularies and\n",
+    "        return the first match's properties\n",
+    "        '''\n",
+    "        raise NotImplementedError()\n",
+    "\n",
+    "\n",
+    "class UnimodModification(ModificationBase):\n",
     "    __slots__ = ()\n",
     "    \n",
     "    prefix_name = \"UNIMOD\"\n",
@@ -132,7 +258,7 @@
     "    _tag_type = TagTypeEnum.unimod\n",
     "\n",
     "\n",
-    "class PSIModModification(ControlledVocabularyModificationBase):\n",
+    "class PSIModModification(ModificationBase):\n",
     "    __slots__ = ()\n",
     "    \n",
     "    prefix_name = \"MOD\"\n",
@@ -140,33 +266,45 @@
     "    _tag_type = TagTypeEnum.psimod\n",
     "\n",
     "\n",
-    "class GNOmeModification(ControlledVocabularyModificationBase):\n",
+    "class GNOmeModification(ModificationBase):\n",
     "    __slots__ = ()\n",
     "    \n",
     "    prefix_name = \"GNO\"\n",
-    "    short_prefix = 'G'\n",
+    "    # short_prefix = 'G'\n",
     "    _tag_type = TagTypeEnum.gnome\n",
     "\n",
     "    \n",
-    "class XLMODModification(ControlledVocabularyModificationBase):\n",
+    "class XLMODModification(ModificationBase):\n",
     "    __slots__ = ()\n",
     "    \n",
     "    prefix_name = \"XLMOD\"\n",
-    "#     short_prefix = 'XL'\n",
+    "    # short_prefix = 'XL'\n",
     "    _tag_type = TagTypeEnum.xlmod\n",
-    "    \n",
-    "    \n",
-    "class TagParserStateEnum(Enum):\n",
-    "    start = 0\n",
-    "    group_id = 1\n",
+    "\n",
     "\n",
     "def split_tags(tokens):\n",
+    "    '''Split a token array into discrete sets of tag\n",
+    "    tokens.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    tokens: list\n",
+    "        The characters of the tag token buffer\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    list of list:\n",
+    "        The tokens for each contained tag\n",
+    "    '''\n",
     "    starts = [0]\n",
     "    ends = []\n",
     "    for i, c in enumerate(tokens):\n",
     "        if c == '|':\n",
     "            ends.append(i)\n",
     "            starts.append(i + 1)\n",
+    "        elif (i != 0 and c == '#'):\n",
+    "            ends.append(i)\n",
+    "            starts.append(i)\n",
     "    ends.append(len(tokens))\n",
     "    out = []\n",
     "    for i, start in enumerate(starts):\n",
@@ -174,64 +312,292 @@
     "        out.append(tokens[start:end])\n",
     "    return out\n",
     "\n",
+    "\n",
     "def find_prefix(tokens):\n",
+    "    '''Find the prefix, if any, of the tag defined by `tokens`\n",
+    "    delimited by \":\".\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    tokens: list\n",
+    "        The tag tokens to search\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    prefix: str or None\n",
+    "        The prefix string, if found\n",
+    "    rest: str\n",
+    "        The rest of the tokens, merged as a string\n",
+    "    '''\n",
     "    for i, c in enumerate(tokens):\n",
     "        if c == ':':\n",
     "            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n",
     "    return None, tokens\n",
-    "    \n",
+    "\n",
+    "def process_marker(tokens):\n",
+    "    '''Process a marker, which is a tag whose value starts with #.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    tokens: list\n",
+    "        The tag tokens to parse\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    PositionLabelTag or LocalizationMarker\n",
+    "    '''\n",
+    "    if tokens[1:3] == 'XL':\n",
+    "        return PositionLabelTag(None, group_id=''.join(tokens))\n",
+    "    else:\n",
+    "        group_id = None\n",
+    "        value = None\n",
+    "        for i, c in  enumerate(tokens):\n",
+    "            if c == '(':\n",
+    "                group_id = ''.join(tokens[:i])\n",
+    "                if tokens[-1] != ')':\n",
+    "                    raise Exception(\"Localization marker with score missing closing parenthesis\")\n",
+    "                value = float(''.join(tokens[i + 1:-1]))\n",
+    "                return LocalizationMarker(value, group_id=group_id)\n",
+    "        else:\n",
+    "            group_id = ''.join(tokens)\n",
+    "            return PositionLabelTag(group_id=group_id)\n",
+    "        \n",
+    "\n",
+    "\n",
     "def process_tag_tokens(tokens):\n",
+    "    '''Convert a tag token buffer into a parsed :class:`TagBase` instance\n",
+    "    of the appropriate sub-type with zero or more sub-tags.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    tokens: list\n",
+    "        The tokens to parse\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    TagBase:\n",
+    "        The parsed tag\n",
+    "    '''\n",
     "    parts = split_tags(tokens)\n",
     "    main_tag = parts[0]\n",
     "    if main_tag[0] in ('+', '-'):\n",
     "        main_tag = ''.join(main_tag)\n",
     "        main_tag = MassModification(main_tag)\n",
+    "    elif main_tag[0] == '#':\n",
+    "        main_tag = process_marker(main_tag)\n",
     "    else:\n",
     "        prefix, value = find_prefix(main_tag)\n",
     "        if prefix is None:\n",
     "            main_tag = GenericModification(''.join(value))\n",
     "        else:\n",
-    "            tag_type = TagBase.prefix_map[prefix]\n",
+    "            tag_type = TagBase.find_by_tag(prefix)\n",
     "            main_tag = tag_type(value)\n",
     "    if len(parts) > 1:\n",
     "        extras = []\n",
-    "        for part in parts:\n",
+    "        for part in parts[1:]:\n",
     "            prefix, value = find_prefix(part)\n",
     "            if prefix is None:\n",
-    "                if value.startswith(\"#\"):\n",
-    "                    main_tag.group_id = value\n",
+    "                if value[0] == \"#\":\n",
+    "                    marker = process_marker(value)\n",
+    "                    if isinstance(marker, PositionLabelTag):\n",
+    "                        main_tag.group_id = ''.join(value)\n",
+    "                    else:\n",
+    "                        main_tag.group_id = marker.group_id\n",
+    "                        extras.append(marker)\n",
     "                else:\n",
-    "                    main_tag.extra.append(GenericModification(''.join(value)))\n",
+    "                    extras.append(GenericModification(''.join(value)))\n",
     "            else:\n",
-    "                tag_type = TagBase.prefix_map[prefix]\n",
-    "                main_tag.extra.append(tag_type(value))\n",
+    "                tag_type = TagBase.find_by_tag(prefix)\n",
+    "                extras.append(tag_type(value))\n",
+    "        main_tag.extra = extras\n",
     "    return main_tag"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 228,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'QWERTYIPASDFGHKLCVNM'"
-      ]
-     },
-     "execution_count": 103,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "from pyteomics import parser\n",
-    "''.join(parser.std_amino_acids)"
+    "class ModificationRule(object):\n",
+    "    '''Define a fixed modification rule which dictates a modification tag is\n",
+    "    always applied at one or more amino acid residues.\n",
+    "\n",
+    "    Attributes\n",
+    "    ----------\n",
+    "    modification_tag: TagBase\n",
+    "        The modification to apply\n",
+    "    targets: list\n",
+    "        The list of amino acids this applies to\n",
+    "    '''\n",
+    "    __slots__ = ('modification_tag', 'targets')\n",
+    "\n",
+    "    def __init__(self, modification_tag, targets=None):\n",
+    "        self.modification_tag = modification_tag\n",
+    "        self.targets = targets\n",
+    "\n",
+    "    def __eq__(self, other):\n",
+    "        if other is None:\n",
+    "            return False\n",
+    "        return self.modification_tag == other.modification_tag and self.targets == other.targets\n",
+    "\n",
+    "    def __ne__(self, other):\n",
+    "        return not self == other\n",
+    "\n",
+    "    def __str__(self):\n",
+    "        targets = ','.join(self.targets)\n",
+    "        return \"<{self.modification_tag}@{targets}>\".format(self=self, targets=targets)\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return \"{self.__class__.__name__}({self.modification_tag!r}, {self.targets})\".format(self=self)\n",
+    "\n",
+    "\n",
+    "class StableIsotope(object):\n",
+    "    '''Define a fixed isotope that is applied globally to all amino acids.\n",
+    "\n",
+    "    Attributes\n",
+    "    ----------\n",
+    "    isotope: str\n",
+    "        The stable isotope string, of the form [<isotope-number>]<element> or a special\n",
+    "        isotopoform's name.\n",
+    "    '''\n",
+    "    __slots__ = ('isotope', )\n",
+    "\n",
+    "    def __init__(self, isotope):\n",
+    "        self.isotope = isotope\n",
+    "\n",
+    "    def __eq__(self, other):\n",
+    "        if other is None:\n",
+    "            return False\n",
+    "        return self.isotope == other.isotope\n",
+    "\n",
+    "    def __ne__(self, other):\n",
+    "        return not self == other\n",
+    "\n",
+    "    def __str__(self):\n",
+    "        return \"<{self.isotope}>\".format(self=self)\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return \"{self.__class__.__name__}({self.isotope})\".format(self=self)\n",
+    "\n",
+    "\n",
+    "class TaggedInterval(object):\n",
+    "    '''Define a fixed interval over the associated sequence which contains the localization\n",
+    "    of the associated tag.\n",
+    "\n",
+    "    Attributes\n",
+    "    ----------\n",
+    "    start: int\n",
+    "        The starting position (inclusive) of the interval along the primary sequence\n",
+    "    end: int\n",
+    "        The ending position (exclusive) of the interval along the primary sequence\n",
+    "    tag: TagBase\n",
+    "        The tag being localized\n",
+    "    '''\n",
+    "    __slots__ = ('start', 'end', 'tag')\n",
+    "\n",
+    "    def __init__(self, start, end=None, tag=None):\n",
+    "        self.start = start\n",
+    "        self.end = end\n",
+    "        self.tag = tag\n",
+    "    \n",
+    "    def __eq__(self, other):\n",
+    "        if other is None:\n",
+    "            return False\n",
+    "        return self.start == other.start and self.end == other.end and self.tag == other.tag\n",
+    "\n",
+    "    def __ne__(self, other):\n",
+    "        return not self == other\n",
+    "\n",
+    "    def __str__(self):\n",
+    "        return \"({self.start}-{self.end}){self.tag!r}\".format(self=self)\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return \"{self.__class__.__name__}({self.start}, {self.end}, {self.tag})\".format(self=self)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 229,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TagParser(object):\n",
+    "    '''A parser which accumulates tokens until it is asked to parse them into\n",
+    "    :class:`TagBase` instances.\n",
+    "\n",
+    "    Implements a subset of the Sequence protocol.\n",
+    "\n",
+    "    Attributes\n",
+    "    ----------\n",
+    "    buffer: list\n",
+    "        The list of tokens accumulated since the last parsing.\n",
+    "    group_ids: set\n",
+    "        The set of all group IDs that have been produced so far.\n",
+    "    '''\n",
+    "\n",
+    "    def __init__(self, initial=None, group_ids=None):\n",
+    "        if initial:\n",
+    "            self.buffer = list(initial)\n",
+    "        else:\n",
+    "            self.buffer = []\n",
+    "        if group_ids:\n",
+    "            self.group_ids = set(group_ids)\n",
+    "        else:\n",
+    "            self.group_ids = set()\n",
+    "    \n",
+    "    def append(self, c):\n",
+    "        '''Append a new character to the buffer.\n",
+    "\n",
+    "        Parameters\n",
+    "        ----------\n",
+    "        c: str\n",
+    "            The character appended\n",
+    "        '''\n",
+    "        self.buffer.append(c)\n",
+    "    \n",
+    "    def reset(self):\n",
+    "        '''Discard the content of the current buffer.\n",
+    "        '''\n",
+    "        self.buffer = []\n",
+    "    \n",
+    "    def __bool__(self):\n",
+    "        return bool(self.buffer)\n",
+    "    \n",
+    "    def __iter__(self):\n",
+    "        return iter(self.buffer)\n",
+    "\n",
+    "    def __getitem__(self, i):\n",
+    "        return self.buffer[i]\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.buffer)\n",
+    "\n",
+    "    def process(self):\n",
+    "        '''Parse the content of the internal buffer, clear the buffer,\n",
+    "        and return the parsed tag.\n",
+    "\n",
+    "        Returns\n",
+    "        -------\n",
+    "        TagBase\n",
+    "        '''\n",
+    "        tag = process_tag_tokens(self.buffer)\n",
+    "        if tag.group_id:\n",
+    "            self.group_ids.add(tag.group_id)\n",
+    "        self.reset()\n",
+    "        return tag"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 230,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -245,7 +611,8 @@
     "    tag_in_sequence = 6\n",
     "    interval_tag = 7\n",
     "    tag_after_sequence = 8\n",
-    "    \n",
+    "    stable_isotope = 9\n",
+    "\n",
     "    done = 999\n",
     "\n",
     "\n",
@@ -253,6 +620,7 @@
     "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n",
     "FIXED = ParserStateEnum.fixed_spec\n",
     "GLOBAL = ParserStateEnum.global_tag\n",
+    "ISOTOPE = ParserStateEnum.stable_isotope\n",
     "LABILE = ParserStateEnum.labile_tag\n",
     "SEQ = ParserStateEnum.sequence\n",
     "TAG = ParserStateEnum.tag_in_sequence\n",
@@ -262,7 +630,27 @@
     "\n",
     "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n",
     "\n",
-    "def tokenize_proforma(sequence):\n",
+    "def parse_proforma(sequence):\n",
+    "    '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a\n",
+    "    mapping of sequence-spanning modifiers.\n",
+    "\n",
+    "    .. note::\n",
+    "        This is a state machine parser, but with certain sub-state paths\n",
+    "        unrolled to avoid an explosion of formal intermediary states.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    sequence: str\n",
+    "        The sequence to parse\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    parsed_sequence: list\n",
+    "        The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence\n",
+    "    modifiers: dict\n",
+    "        A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized\n",
+    "        modifications, tagged intervals, and group IDs\n",
+    "    '''\n",
     "    labile_modifications = []\n",
     "    fixed_modifications = []\n",
     "    unlocalized_modifications = []\n",
@@ -280,7 +668,7 @@
     "    depth = 0\n",
     "    \n",
     "    current_aa = None\n",
-    "    current_tag = []\n",
+    "    current_tag = TagParser()\n",
     "    current_interval = None\n",
     "    \n",
     "    while i < n:\n",
@@ -298,35 +686,22 @@
     "            elif c in VALID_AA:\n",
     "                current_aa = c\n",
     "                state = SEQ\n",
-    "            elif c == '?':\n",
-    "                if current_tag:\n",
-    "                    unlocalized_modifications.append(process_tag_tokens(current_tag))\n",
-    "                    current_tag = []\n",
-    "                else:\n",
-    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "            elif c == '-':\n",
-    "                if current_tag:\n",
-    "                    n_term = process_tag_tokens(current_tag)\n",
-    "                    current_tag = []\n",
-    "                else:\n",
-    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
     "            else:\n",
     "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
     "        elif state == SEQ:\n",
     "            if c in VALID_AA:\n",
-    "                positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n",
+    "                positions.append((current_aa, current_tag.process() if current_tag else None))\n",
     "                current_aa = c\n",
-    "                current_tag = []\n",
     "            elif c == '[':\n",
     "                state = TAG\n",
     "                depth = 1\n",
     "            elif c == '(':\n",
-    "                current_interval = [len(positions), None, None]\n",
+    "                current_interval = TaggedInterval(len(positions) + 1)\n",
     "            elif c == ')':\n",
     "                if current_interval is None:\n",
     "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
     "                else:\n",
-    "                    current_interval[1] = len(positions)\n",
+    "                    current_interval.end = len(positions) + 1\n",
     "                    if i >= n or sequence[i] != '[':\n",
     "                        raise Exception(\"Missing Interval Tag\")\n",
     "                    i += 1\n",
@@ -340,7 +715,7 @@
     "                depth = 1                \n",
     "            else:\n",
     "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER:\n",
+    "        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:\n",
     "            if c == '[':\n",
     "                depth += 1\n",
     "            elif c == ']':\n",
@@ -350,12 +725,65 @@
     "                    if state == TAG: \n",
     "                        state = SEQ\n",
     "                    elif state == TAG_BEFORE:\n",
+    "                        if i < n:\n",
+    "                            cnext = sequence[i]\n",
+    "                            if cnext == '?':\n",
+    "                                unlocalized_modifications.append(current_tag.process())\n",
+    "                                i += 1\n",
+    "                            elif cnext == '-':\n",
+    "                                n_term = current_tag.process()\n",
+    "                                i += 1\n",
+    "                            else:\n",
+    "                                i += 1\n",
+    "                                raise Exception(\"Error In State {state}, unexpected {cnext} found at index {i}\".format(**locals()))\n",
+    "\n",
     "                        state = BEFORE\n",
     "                    elif state == TAG_AFTER:\n",
-    "                        c_term = process_tag_tokens(current_tag)\n",
+    "                        c_term = current_tag.process()\n",
     "                        state = DONE\n",
+    "                    elif state == GLOBAL:\n",
+    "                        # Gobble the rest of the global tag inline to avoid spawning\n",
+    "                        # a whole new state.\n",
+    "                        if i < n:\n",
+    "                            c = sequence[i]\n",
+    "                            i += 1\n",
+    "                            if c != '@':\n",
+    "                                raise Exception(\n",
+    "                                    (\"Error In State {state}, fixed modification detected without \"\n",
+    "                                    \"target amino acids found at index {i}\").format(**locals()))\n",
+    "                            end = 0\n",
+    "                            targets = []\n",
+    "                            while i < n:\n",
+    "                                c = sequence[i]\n",
+    "                                i += 1\n",
+    "                                if c in VALID_AA:\n",
+    "                                    targets.append(c)\n",
+    "                                elif c == ',':\n",
+    "                                    pass\n",
+    "                                elif '>':\n",
+    "                                    break\n",
+    "                            else:\n",
+    "                                raise Exception(\n",
+    "                                    (\"Error In State {state}, unclosed fixed modification rule\").format(**locals()))\n",
+    "\n",
+    "                        fixed_modifications.append(\n",
+    "                            ModificationRule(current_tag.process(), targets))\n",
+    "                        state = BEFORE\n",
+    "            else:\n",
+    "                current_tag.append(c)\n",
+    "        elif state == FIXED:\n",
+    "            if c == '[':\n",
+    "                state = GLOBAL\n",
     "            else:\n",
+    "                state = ISOTOPE\n",
     "                current_tag.append(c)\n",
+    "        elif state == ISOTOPE:\n",
+    "            if c != '>':\n",
+    "                current_tag.append(c)\n",
+    "            else:\n",
+    "                isotopes.append(StableIsotope(''.join(current_tag)))\n",
+    "                current_tag.reset()\n",
+    "                state = BEFORE\n",
     "        elif state == LABILE:\n",
     "            if c == '{':\n",
     "                depth += 1\n",
@@ -363,8 +791,7 @@
     "                depth -= 1\n",
     "                if depth <= 0:\n",
     "                    depth = 0\n",
-    "                    labile_modifications.append(process_tag_tokens(current_tag))\n",
-    "                    current_tag = []\n",
+    "                    labile_modifications.append(current_tag.process())\n",
     "                    state = BEFORE\n",
     "            else:\n",
     "                current_tag.append(c)\n",
@@ -375,8 +802,7 @@
     "                depth -= 1\n",
     "                if depth <= 0:\n",
     "                    depth = 0\n",
-    "                    current_interval[2] = process_tag_tokens(current_tag)\n",
-    "                    current_tag = []\n",
+    "                    current_interval.tag = current_tag.process()\n",
     "                    intervals.append(current_interval)\n",
     "                    current_interval = None\n",
     "                    state = SEQ\n",
@@ -384,50 +810,209 @@
     "                current_tag.append(c)\n",
     "        else:\n",
     "            raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
+    "    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):\n",
+    "        raise Exception(\"Error In State {state}, unclosed group reached end of string!\".format(**locals()))\n",
     "    if current_aa:\n",
-    "        positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n",
+    "        positions.append((current_aa, current_tag.process() if current_tag else None))\n",
     "    return positions, {\n",
     "        'n_term': n_term,\n",
     "        'c_term': c_term,\n",
     "        'unlocalized_modifications': unlocalized_modifications,\n",
     "        'labile_modifications': labile_modifications,\n",
+    "        'fixed_modifications': fixed_modifications,\n",
     "        'intervals': intervals,\n",
+    "        'isotopes': isotopes,\n",
+    "        'group_ids': list(current_tag.group_ids)\n",
     "    }"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 231,
    "metadata": {},
    "outputs": [
     {
+     "output_type": "execute_result",
      "data": {
       "text/plain": [
        "([('S', None),\n",
-       "  ('T', UnimodModification('Ox', [], None)),\n",
+       "  ('T', UnimodModification('Ox', None, None)),\n",
        "  ('E', None),\n",
        "  ('P', None),\n",
        "  ('P', None),\n",
        "  ('I', None),\n",
        "  ('N', None),\n",
        "  ('G', None)],\n",
-       " {'n_term': GenericModification('Hex', [], None),\n",
+       " {'n_term': GenericModification('Hex', None, None),\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [GenericModification('Bar', None, None)],\n",
+       "  'labile_modifications': [GenericModification('Foo', None, None)],\n",
+       "  'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n",
+       "  'intervals': [TaggedInterval(2, 5, 18.0000)],\n",
+       "  'isotopes': [StableIsotope(13C)],\n",
+       "  'group_ids': []})"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 231
+    }
+   ],
+   "source": [
+    "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Bar]?{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n",
+    "seq, fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 232,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "([('S', None),\n",
+       "  ('E', None),\n",
+       "  ('P', None),\n",
+       "  ('P', None),\n",
+       "  ('I', None),\n",
+       "  ('N', None),\n",
+       "  ('G', None)],\n",
+       " {'n_term': None,\n",
        "  'c_term': None,\n",
        "  'unlocalized_modifications': [],\n",
-       "  'labile_modifications': [GenericModification('Foo', [], None)],\n",
-       "  'intervals': [[1, 4, MassModification(18.0, [], None)]]})"
+       "  'labile_modifications': [],\n",
+       "  'fixed_modifications': [],\n",
+       "  'intervals': [TaggedInterval(1, 4, 18.0000)],\n",
+       "  'isotopes': [],\n",
+       "  'group_ids': []})"
       ]
      },
-     "execution_count": 110,
      "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 232
     }
    ],
    "source": [
-    "seq, fields = tokenize_proforma(\"{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n",
+    "seq, fields = tokenize_proforma(\"S(EPP)[+18]ING\")\n",
     "seq, fields"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 233,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "([('E', None),\n",
+       "  ('M', GenericModification('Oxidation', None, None)),\n",
+       "  ('E', None),\n",
+       "  ('V', None),\n",
+       "  ('T', LocalizationMarker(0.01, None, '#s1')),\n",
+       "  ('S', LocalizationMarker(0.09, None, '#s1')),\n",
+       "  ('E', None),\n",
+       "  ('S', LocalizationMarker(0.9, None, '#s1')),\n",
+       "  ('P', None),\n",
+       "  ('E', None),\n",
+       "  ('K', None)],\n",
+       " {'n_term': None,\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [GenericModification('Phospho', [], '#s1')],\n",
+       "  'labile_modifications': [],\n",
+       "  'fixed_modifications': [],\n",
+       "  'intervals': [],\n",
+       "  'isotopes': [],\n",
+       "  'group_ids': ['#s1']})"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 233
+    }
+   ],
+   "source": [
+    "parse_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 234,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "([('E', None),\n",
+       "  ('M', GenericModification('Oxidation', None, None)),\n",
+       "  ('E', None),\n",
+       "  ('V', None),\n",
+       "  ('T', LocalizationMarker(0.01, None, '#g1')),\n",
+       "  ('S', LocalizationMarker(0.09, None, '#g1')),\n",
+       "  ('E', None),\n",
+       "  ('S',\n",
+       "   GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n",
+       "  ('P', None),\n",
+       "  ('E', None),\n",
+       "  ('K', None)],\n",
+       " {'n_term': None,\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [],\n",
+       "  'labile_modifications': [],\n",
+       "  'fixed_modifications': [],\n",
+       "  'intervals': [],\n",
+       "  'isotopes': [],\n",
+       "  'group_ids': ['#g1']})"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 234
+    }
+   ],
+   "source": [
+    "tokenize_proforma(\"EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 235,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "([('E', None),\n",
+       "  ('M', None),\n",
+       "  ('E', None),\n",
+       "  ('V', None),\n",
+       "  ('T', LocalizationMarker(0.01, None, '#g1')),\n",
+       "  ('S', LocalizationMarker(0.09, None, '#g1')),\n",
+       "  ('E', None),\n",
+       "  ('S',\n",
+       "   GlycanModification('HexNAc 1', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n",
+       "  ('P', None),\n",
+       "  ('E', None),\n",
+       "  ('K', None)],\n",
+       " {'n_term': None,\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [],\n",
+       "  'labile_modifications': [],\n",
+       "  'fixed_modifications': [],\n",
+       "  'intervals': [],\n",
+       "  'isotopes': [],\n",
+       "  'group_ids': ['#g1']})"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 235
+    }
+   ],
+   "source": [
+    "tokenize_proforma(\"EMEVT[#g1(0.01)]S[#g1(0.09)]ES[Glycan:HexNAc 1#g1(0.90)]PEK\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -452,9 +1037,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.8.5-final"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file

From 2dbe1c819c3663ea75e21a54223bb7e128e76c3e Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Wed, 30 Dec 2020 23:57:30 -0500
Subject: [PATCH 04/27] Add cv resolver

---
 .gitignore            |   3 +-
 pyteomics/proforma.py | 341 +++++++++++++++++++++++++++++++-----------
 2 files changed, 258 insertions(+), 86 deletions(-)

diff --git a/.gitignore b/.gitignore
index 382ed2c6..156bb9dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@
 __pycache__
 *.egg-info
 *.pyc
-.ipynb_checkpoints
\ No newline at end of file
+.ipynb_checkpoints
+.vscode
\ No newline at end of file
diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 26ea3355..c98d29a9 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -39,6 +39,16 @@
 
 from pyteomics import parser
 from pyteomics.mass import Composition
+from pyteomics.auxiliary import PyteomicsError
+from pyteomics.mass import Unimod
+
+
+class ProFormaError(PyteomicsError):
+    def __init__(self, message, index=None, parser_state=None, **kwargs):
+        super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state)
+        self.message = message
+        self.index = index
+        self.parser_state = parser_state
 
 
 class PrefixSavingMeta(type):
@@ -119,7 +129,7 @@ def __str__(self):
         else:
             label = part
         if self.group_id:
-            label = '%s%s' % (label, self.group_id)
+            label = '%s|%s' % (label, self.group_id)
         return '%s' % label
 
     def __repr__(self):
@@ -204,24 +214,113 @@ def _format_main(self):
         return '%0.4f' % self.value
 
 
+class ModificationResolver(object):
+    def __init__(self, name, *args, **kwargs):
+        self.name = name
+
+    def resolve(self, name=None, id=None, **kwargs):
+        raise NotImplementedError()
+
+    def __call__(self, name=None, id=None, **kwargs):
+        return self.resolve(name, id, **kwargs)
+
+
+class UnimodResolver(ModificationResolver):
+    def __init__(self, *args, **kwargs):
+        super(UnimodResolver, self).__init__("unimod", *args, **kwargs)
+        self._database = kwargs.get("database" )
+
+    @property
+    def database(self):
+        if not self._database:
+            self._database = Unimod()
+        return self._database
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            defn = self.database.by_title(name)
+            if not defn:
+                defn = self.database.by_name(name)
+            if not defn:
+                raise KeyError(name)
+        elif id is not None:
+            defn = self.database.by_id(id)
+            if not defn:
+                raise KeyError(id)
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        return {
+            'composition': defn['composition'],
+            'name': defn['title'],
+            'id': defn['record_id'],
+            'mass': defn['mono_mass'],
+            'provider': self.name
+        }
+
+
 class ModificationBase(TagBase):
     '''A base class for all modification tags with marked prefixes.
     '''
 
     _tag_type = None
-    __slots__ = ()
+    __slots__ = ('_definition', )
 
     def __init__(self, value, extra=None, group_id=None):
         super(ModificationBase, self).__init__(
             self._tag_type, value, extra, group_id)
+        self._definition = None
+
+    @property
+    def definition(self):
+        if self._definition is None:
+            self._definition = self.resolve()
+        return self._definition
+
+    @property
+    def mass(self):
+        return self.definition['mass']
+
+    @property
+    def composition(self):
+        return self.definition.get('composition')
+
+    @property
+    def id(self):
+        return self.definition.get('id')
+
+    @property
+    def name(self):
+        return self.definition.get('name')
+
+    @property
+    def provider(self):
+        return self.definition.get('provider')
+
+    def _populate_from_definition(self, definition):
+        self._definition = definition
 
     def _format_main(self):
         return "{self.prefix_name}:{self.value}".format(self=self)
 
+    def _parse_identifier(self):
+        tokens = self.value.split(":", 1)
+        if len(tokens) > 1:
+            value = tokens[1]
+        else:
+            value = self.value
+        if value.isdigit():
+            id = int(value)
+            name = None
+        else:
+            name = value
+            id = None
+        return name, id
+
     def resolve(self):
         '''Find the term and return it's properties
         '''
-        raise NotImplementedError()
+        keys = self._parse_identifier()
+        return self.resolver(*keys)
 
 
 class FormulaModification(ModificationBase):
@@ -236,7 +335,8 @@ def resolve(self):
         composition = Composition(formula=''.join(self.value.split(" ")))
         return {
             "mass": composition.mass(),
-            "composition": composition
+            "composition": composition,
+            "name": self.value
         }
 
 
@@ -246,26 +346,11 @@ class GlycanModification(ModificationBase):
     _tag_type = TagTypeEnum.glycan
 
 
-class GenericModification(TagBase):
-    __slots__ = ()
-
-    def __init__(self, value, extra=None, group_id=None):
-        super(GenericModification, self).__init__(
-            TagTypeEnum.generic, value, extra, group_id)
-
-    def _format_main(self):
-        return self.value
-
-    def resolve(self):
-        '''Find the term, searching through all available vocabularies and
-        return the first match's properties
-        '''
-        raise NotImplementedError()
-
-
 class UnimodModification(ModificationBase):
     __slots__ = ()
 
+    resolver = UnimodResolver()
+
     prefix_name = "UNIMOD"
     short_prefix = "U"
     _tag_type = TagTypeEnum.unimod
@@ -295,6 +380,33 @@ class XLMODModification(ModificationBase):
     _tag_type = TagTypeEnum.xlmod
 
 
+class GenericModification(ModificationBase):
+    __slots__ = ()
+    _tag_type = TagTypeEnum.generic
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(GenericModification, self).__init__(
+            value, extra, group_id)
+
+    def _format_main(self):
+        return self.value
+
+    def resolve(self):
+        '''Find the term, searching through all available vocabularies and
+        return the first match's properties
+        '''
+        keys = self._parse_identifier()
+        defn = None
+        try:
+            defn = UnimodModification.resolver(*keys)
+        except KeyError:
+            pass
+        if defn is not None:
+            return defn
+        raise KeyError(keys)
+
+
+
 def split_tags(tokens):
     '''Split a token array into discrete sets of tag
     tokens.
@@ -524,9 +636,9 @@ def __repr__(self):
         return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self)
 
 
-class TagParser(object):
-    '''A parser which accumulates tokens until it is asked to parse them into
-    :class:`TagBase` instances.
+class TokenBuffer(object):
+    '''A token buffer that wraps the accumulation and reset logic
+    of a list of :class:`str` objects.
 
     Implements a subset of the Sequence protocol.
 
@@ -534,19 +646,9 @@ class TagParser(object):
     ----------
     buffer: list
         The list of tokens accumulated since the last parsing.
-    group_ids: set
-        The set of all group IDs that have been produced so far.
     '''
-
-    def __init__(self, initial=None, group_ids=None):
-        if initial:
-            self.buffer = list(initial)
-        else:
-            self.buffer = []
-        if group_ids:
-            self.group_ids = set(group_ids)
-        else:
-            self.group_ids = set()
+    def __init__(self, initial=None):
+        self.buffer = list(initial or [])
 
     def append(self, c):
         '''Append a new character to the buffer.
@@ -575,6 +677,53 @@ def __getitem__(self, i):
     def __len__(self):
         return len(self.buffer)
 
+    def process(self):
+        value = self.buffer
+        self.reset()
+        return value
+
+    def __call__(self):
+        return self.process()
+
+
+class NumberParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`int` instances.
+
+    Implements a subset of the Sequence protocol.
+
+    Attributes
+    ----------
+    buffer: list
+        The list of tokens accumulated since the last parsing.
+    '''
+    def process(self):
+        value = int(''.join(self))
+        self.reset()
+        return  value
+
+
+class TagParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`TagBase` instances.
+
+    Implements a subset of the Sequence protocol.
+
+    Attributes
+    ----------
+    buffer: list
+        The list of tokens accumulated since the last parsing.
+    group_ids: set
+        The set of all group IDs that have been produced so far.
+    '''
+
+    def __init__(self, initial=None, group_ids=None):
+        super(TagParser, self).__init__(initial)
+        if group_ids:
+            self.group_ids = set(group_ids)
+        else:
+            self.group_ids = set()
+
     def process(self):
         '''Parse the content of the internal buffer, clear the buffer,
         and return the parsed tag.
@@ -601,7 +750,10 @@ class ParserStateEnum(Enum):
     interval_tag = 7
     tag_after_sequence = 8
     stable_isotope = 9
-
+    post_tag_before = 10
+    unlocalized_count = 11
+    post_global = 12
+    post_global_aa = 13
     done = 999
 
 
@@ -615,6 +767,10 @@ class ParserStateEnum(Enum):
 TAG = ParserStateEnum.tag_in_sequence
 INTERVAL_TAG = ParserStateEnum.interval_tag
 TAG_AFTER = ParserStateEnum.tag_after_sequence
+POST_TAG_BEFORE = ParserStateEnum.post_tag_before
+UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count
+POST_GLOBAL = ParserStateEnum.post_global
+POST_GLOBAL_AA = ParserStateEnum.post_global_aa
 DONE = ParserStateEnum.done
 
 VALID_AA = set("QWERTYIPASDFGHKLCVNM")
@@ -659,6 +815,8 @@ def parse_proforma(sequence):
     current_aa = None
     current_tag = TagParser()
     current_interval = None
+    current_unlocalized_count = NumberParser()
+    current_aa_targets = TokenBuffer()
 
     while i < n:
         c = sequence[i]
@@ -676,7 +834,8 @@ def parse_proforma(sequence):
                 current_aa = c
                 state = SEQ
             else:
-                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
         elif state == SEQ:
             if c in VALID_AA:
                 positions.append((current_aa, current_tag.process() if current_tag else None))
@@ -688,22 +847,22 @@ def parse_proforma(sequence):
                 current_interval = TaggedInterval(len(positions) + 1)
             elif c == ')':
                 if current_interval is None:
-                    raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+                    raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
                 else:
                     current_interval.end = len(positions) + 1
                     if i >= n or sequence[i] != '[':
-                        raise Exception("Missing Interval Tag")
+                        raise ProFormaError("Missing Interval Tag", i, state)
                     i += 1
                     depth = 1
                     state = INTERVAL_TAG
             elif c == '-':
                 state = TAG_AFTER
                 if i >= n or sequence[i] != '[':
-                    raise Exception("Missing Interval Tag")
+                    raise ProFormaError("Missing Interval Tag", i, state)
                 i += 1
                 depth = 1
             else:
-                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+                raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
         elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:
             if c == '[':
                 depth += 1
@@ -714,62 +873,26 @@ def parse_proforma(sequence):
                     if state == TAG:
                         state = SEQ
                     elif state == TAG_BEFORE:
-                        if i < n:
-                            cnext = sequence[i]
-                            if cnext == '?':
-                                unlocalized_modifications.append(current_tag.process())
-                                i += 1
-                            elif cnext == '-':
-                                n_term = current_tag.process()
-                                i += 1
-                            else:
-                                i += 1
-                                raise Exception("Error In State {state}, unexpected {cnext} found at index {i}".format(**locals()))
-
-                        state = BEFORE
+                        state = POST_TAG_BEFORE
                     elif state == TAG_AFTER:
                         c_term = current_tag.process()
                         state = DONE
                     elif state == GLOBAL:
-                        # Gobble the rest of the global tag inline to avoid spawning
-                        # a whole new state.
-                        if i < n:
-                            c = sequence[i]
-                            i += 1
-                            if c != '@':
-                                raise Exception(
-                                    ("Error In State {state}, fixed modification detected without "
-                                    "target amino acids found at index {i}").format(**locals()))
-                            end = 0
-                            targets = []
-                            while i < n:
-                                c = sequence[i]
-                                i += 1
-                                if c in VALID_AA:
-                                    targets.append(c)
-                                elif c == ',':
-                                    pass
-                                elif '>':
-                                    break
-                            else:
-                                raise Exception(
-                                    ("Error In State {state}, unclosed fixed modification rule").format(**locals()))
-
-                        fixed_modifications.append(
-                            ModificationRule(current_tag.process(), targets))
-                        state = BEFORE
+                        state = POST_GLOBAL
             else:
                 current_tag.append(c)
         elif state == FIXED:
             if c == '[':
                 state = GLOBAL
             else:
+                # Do validation here
                 state = ISOTOPE
                 current_tag.append(c)
         elif state == ISOTOPE:
             if c != '>':
                 current_tag.append(c)
             else:
+                # Not technically a tag, but exploits the current buffer
                 isotopes.append(StableIsotope(''.join(current_tag)))
                 current_tag.reset()
                 state = BEFORE
@@ -797,10 +920,58 @@ def parse_proforma(sequence):
                     state = SEQ
             else:
                 current_tag.append(c)
+        elif state == POST_TAG_BEFORE:
+            if c == '?':
+                unlocalized_modifications.append(current_tag.process())
+                state = BEFORE
+            elif c == '-':
+                n_term = current_tag.process()
+                state = BEFORE
+            elif c == '^':
+                state = UNLOCALIZED_COUNT
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == UNLOCALIZED_COUNT:
+            if c.isdigit():
+                current_unlocalized_count.append(c)
+            elif c == '[':
+                state = TAG_BEFORE
+                depth = 1
+                tag = current_tag.process()
+                multiplicity = current_unlocalized_count.process()
+                for _ in range(multiplicity):  # `_`, not `i`: `i` is the index reported in parse errors
+                    unlocalized_modifications.append(tag)
+            elif c == '?':
+                state = BEFORE
+                tag = current_tag.process()
+                multiplicity = current_unlocalized_count.process()
+                for _ in range(multiplicity):  # repeat the tag without rebinding the parser's `i`
+                    unlocalized_modifications.append(tag)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == POST_GLOBAL:
+            if c == '@':
+                state = POST_GLOBAL_AA
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, fixed modification detected without "
+                     "target amino acids found at index {i}").format(**locals()), i, state)
+        elif state == POST_GLOBAL_AA:
+            if c in VALID_AA:
+                current_aa_targets.append(c)
+            elif c == '>':
+                fixed_modifications.append(
+                    ModificationRule(current_tag.process(), current_aa_targets.process()))
+                state = BEFORE
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
         else:
-            raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
+            raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
     if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
-        raise Exception("Error In State {state}, unclosed group reached end of string!".format(**locals()))
+        raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
     if current_aa:
         positions.append((current_aa, current_tag.process() if current_tag else None))
     return positions, {
@@ -811,5 +982,5 @@ def parse_proforma(sequence):
         'fixed_modifications': fixed_modifications,
         'intervals': intervals,
         'isotopes': isotopes,
-        'group_ids': list(current_tag.group_ids)
+        'group_ids': sorted(current_tag.group_ids)
     }

From 0b79ccc761fe52123abf6fae9a0d0bdecb084ef4 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Thu, 7 Jan 2021 08:51:32 -0500
Subject: [PATCH 05/27] Clean up markers

---
 pyteomics/proforma.py | 302 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 281 insertions(+), 21 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index c98d29a9..6b4b39c1 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -27,7 +27,8 @@
 '''
 
 import re
-from collections import namedtuple, defaultdict
+from collections import namedtuple, defaultdict, deque
+from functools import partial
 
 try:
     from enum import Enum
@@ -38,10 +39,22 @@
 from six import add_metaclass
 
 from pyteomics import parser
-from pyteomics.mass import Composition
-from pyteomics.auxiliary import PyteomicsError
+from pyteomics.mass import Composition, std_aa_mass
+from pyteomics.auxiliary import PyteomicsError, BasicComposition
 from pyteomics.mass import Unimod
 
+# To eventually be implemented with pyteomics port?
+try:
+    from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache)
+except ImportError:
+    def _needs_psims(name, *args, **kwargs):  # stand-in loader: accepts and ignores any loader arguments
+        raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims`." % name)
+
+    load_psimod = partial(_needs_psims, 'PSIMOD')  # each stub fails only when called, not at import time
+    load_xlmod = partial(_needs_psims, 'XLMOD')
+    load_gno = partial(_needs_psims, 'GNO')
+    obo_cache = None
+
 
 class ProFormaError(PyteomicsError):
     def __init__(self, message, index=None, parser_state=None, **kwargs):
@@ -155,7 +168,20 @@ def find_extra(self, label):
         return out
 
 
-class PositionLabelTag(TagBase):
+class GroupLabelBase(TagBase):
+    __slots__ = ()
+
+    def __str__(self):
+        part = self._format_main()
+        if self.extra:
+            rest = [str(e) for e in self.extra]
+            label = '|'.join([part] + rest)
+        else:
+            label = part
+        return '%s' % label
+
+
+class PositionLabelTag(GroupLabelBase):
     '''A tag to mark that a position is involved in a group in some way, but does
     not imply any specific semantics.
     '''
@@ -167,10 +193,10 @@ def __init__(self, value=None, extra=None, group_id=None):
             TagTypeEnum.position_label, group_id, extra, group_id)
 
     def _format_main(self):
-        return "#{self.group_id}".format(self=self)
+        return "{self.group_id}".format(self=self)
 
 
-class LocalizationMarker(TagBase):
+class LocalizationMarker(GroupLabelBase):
     '''A tag to mark a particular localization site
     '''
     __slots__ = ()
@@ -181,7 +207,7 @@ def __init__(self, value, extra=None, group_id=None):
             TagTypeEnum.localization_marker, float(value), extra, group_id)
 
     def _format_main(self):
-        return "#{self.group_id}({self.value!f})".format(self=self)
+        return "{self.group_id}({self.value:.4g})".format(self=self)
 
 
 class InformationTag(TagBase):
@@ -211,12 +237,29 @@ def __init__(self, value, extra=None, group_id=None):
             TagTypeEnum.massmod, float(value), extra, group_id)
 
     def _format_main(self):
-        return '%0.4f' % self.value
+        # Fixed-point with an explicit sign; zeros are trimmed only from the fractional part, so an
+        # integral shift such as 100 renders as "+100" instead of being cut down to "+1".
+        text = '%+0.4f' % self.value
+        return text.rstrip('0').rstrip('.')
+
+    @property
+    def mass(self):
+        return self.value
 
 
 class ModificationResolver(object):
     def __init__(self, name, *args, **kwargs):
         self.name = name
+        self._database = None
+
+    def load_database(self):
+        raise NotImplementedError()
+
+    @property
+    def database(self):
+        if not self._database:
+            self._database = self.load_database()
+        return self._database
 
     def resolve(self, name=None, id=None, **kwargs):
         raise NotImplementedError()
@@ -228,13 +271,10 @@ def __call__(self, name=None, id=None, **kwargs):
 class UnimodResolver(ModificationResolver):
     def __init__(self, *args, **kwargs):
         super(UnimodResolver, self).__init__("unimod", *args, **kwargs)
-        self._database = kwargs.get("database" )
+        self._database = kwargs.get("database")
 
-    @property
-    def database(self):
-        if not self._database:
-            self._database = Unimod()
-        return self._database
+    def load_database(self):
+        return Unimod()
 
     def resolve(self, name=None, id=None, **kwargs):
         if name is not None:
@@ -258,6 +298,71 @@ def resolve(self, name=None, id=None, **kwargs):
         }
 
 
+class PSIModResolver(ModificationResolver):
+    '''Resolve PSI-MOD terms by name or numeric id, loading the vocabulary through `psims` on demand.'''
+
+    def __init__(self, *args, **kwargs):
+        super(PSIModResolver, self).__init__('psimod', *args, **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_psimod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        '''Return mass, composition, name, id and provider of the term found by `name` or, failing that, `id`.'''
+        if name is not None:
+            defn = self.database[name]
+        elif id is not None:
+            # Accessions are zero-padded to five digits, e.g. id=35 -> 'MOD:00035'.
+            defn = self.database['MOD:{:05d}'.format(int(id))]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        # NOTE(review): DiffMono/DiffFormula appear to carry one wrapping character per side - confirm.
+        mass = float(defn.DiffMono.strip()[1:-1])
+        composition = Composition(defn.DiffFormula.strip()[1:-1].replace(" ", ''))
+        return {'mass': mass, 'composition': composition, 'name': defn.name,
+                'id': defn.id, 'provider': self.name}
+
+
+class XLMODResolver(ModificationResolver):
+    '''Resolve XLMOD terms by name or numeric id, loading the vocabulary through `psims` on demand.'''
+
+    def __init__(self, *args, **kwargs):
+        super(XLMODResolver, self).__init__('xlmod', *args, **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_xlmod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        '''Return mass, composition, name, id and provider of the term found by `name` or, failing that, `id`.'''
+        if name is not None:
+            defn = self.database[name]
+        elif id is not None:
+            # Accessions are zero-padded to five digits, e.g. id=2 -> 'XLMOD:00002'.
+            defn = self.database['XLMOD:{:05d}'.format(int(id))]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        mass = float(defn['monoIsotopicMass'])
+        composition = None  # stays None when the term carries neither formula attribute
+        for key in ('deadEndFormula', 'bridgeFormula'):
+            if key in defn:
+                # "D" is rewritten as H[2] (presumably deuterium) before the formula is parsed.
+                composition = Composition(defn[key].replace(" ", '').replace("D", "H[2]"))
+                break
+        return {'mass': mass, 'composition': composition, 'name': defn.name,
+                'id': defn.id, 'provider': self.name}
+
+
+class GNOResolver(ModificationResolver):
+    def __init__(self, *args, **kwargs):
+        super(GNOResolver, self).__init__('gnome', *args, **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_gno()
+
+
 class ModificationBase(TagBase):
     '''A base class for all modification tags with marked prefixes.
     '''
@@ -331,7 +436,6 @@ class FormulaModification(ModificationBase):
     def resolve(self):
         # The handling of fixed isotopes is wrong here as Pyteomics uses a different
         # convention.
-        from pyteomics.mass import Composition
         composition = Composition(formula=''.join(self.value.split(" ")))
         return {
             "mass": composition.mass(),
@@ -345,6 +449,44 @@ class GlycanModification(ModificationBase):
 
     _tag_type = TagTypeEnum.glycan
 
+    valid_monosaccharides = {  # name -> (mass, elemental composition); each formula must agree with its mass
+        "Hex": (162.0528, Composition("C6H10O5")),
+        "HexNAc": (203.0793, Composition("C8H13N1O5")),
+        "HexS": (242.009, Composition("C6H10O8S1")),
+        "HexP": (242.0191, Composition("C6H11O8P1")),
+        "HexNAcS": (283.0361, Composition("C8H13N1O8S1")),
+        "dHex": (146.0579, Composition("C6H10O4")),
+        "NeuAc": (291.0954, Composition("C11H17N1O8")),
+        "NeuGc": (307.0903, Composition("C11H17N1O9")),
+        "Pen": (132.0422, Composition("C5H8O4")),
+        "Fuc": (146.0579, Composition("C6H10O4"))
+    }
+
+    tokenizer = re.compile(r"([A-Za-z]+)\s*(\d*)\s*")
+
+    def resolve(self):
+        '''Total the mass and composition of the monosaccharides named in ``self.value``.
+        Raises ValueError for any name missing from ``valid_monosaccharides``.'''
+        composite = BasicComposition()
+        for tok, cnt in self.tokenizer.findall(self.value):
+            if tok not in self.valid_monosaccharides:
+                # str.format, not an f-string: the module otherwise keeps Python 2 compatible syntax.
+                raise ValueError("{!r} is not a valid monosaccharide name".format(tok))
+            composite[tok] += int(cnt) if cnt else 1  # a missing count means one unit
+        # Fold the per-name counts into a single mass and a single elemental composition.
+        mass = 0
+        chemcomp = Composition()
+        for key, cnt in composite.items():
+            m, c = self.valid_monosaccharides[key]
+            mass += m * cnt
+            chemcomp += c * cnt
+        return {
+            "mass": mass,
+            "composition": chemcomp,
+            "name": self.value,
+            "monosaccharides": composite
+        }
+
 
 class UnimodModification(ModificationBase):
     __slots__ = ()
@@ -359,6 +501,8 @@ class UnimodModification(ModificationBase):
 class PSIModModification(ModificationBase):
     __slots__ = ()
 
+    resolver = PSIModResolver()
+
     prefix_name = "MOD"
     short_prefix = 'M'
     _tag_type = TagTypeEnum.psimod
@@ -375,6 +519,8 @@ class GNOmeModification(ModificationBase):
 class XLMODModification(ModificationBase):
     __slots__ = ()
 
+    resolver = XLMODResolver()
+
     prefix_name = "XLMOD"
     # short_prefix = 'XL'
     _tag_type = TagTypeEnum.xlmod
@@ -406,7 +552,6 @@ def resolve(self):
         raise KeyError(keys)
 
 
-
 def split_tags(tokens):
     '''Split a token array into discrete sets of tag
     tokens.
@@ -434,7 +579,15 @@ def split_tags(tokens):
     out = []
     for i, start in enumerate(starts):
         end = ends[i]
-        out.append(tokens[start:end])
+        tag = tokens[start:end]
+        if len(tag) == 0:
+            continue
+        # Short circuit on INFO tags which can't be broken
+        # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']):
+        #     tag = tokens[start:]
+        #     out.append(tag)
+        #     break
+        out.append(tag)
     return out
 
 
@@ -566,7 +719,7 @@ def __ne__(self, other):
 
     def __str__(self):
         targets = ','.join(self.targets)
-        return "<{self.modification_tag}@{targets}>".format(self=self, targets=targets)
+        return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets)
 
     def __repr__(self):
         return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self)
@@ -790,7 +943,7 @@ def parse_proforma(sequence):
 
     Returns
     -------
-    parsed_sequence: list
+    parsed_sequence: list[tuple[str, TagBase]]
         The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence
     modifiers: dict
         A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized
@@ -970,10 +1123,10 @@ def parse_proforma(sequence):
                     ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
         else:
             raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
-        raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
     if current_aa:
         positions.append((current_aa, current_tag.process() if current_tag else None))
+    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
+        raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
     return positions, {
         'n_term': n_term,
         'c_term': c_term,
@@ -984,3 +1137,110 @@ def parse_proforma(sequence):
         'isotopes': isotopes,
         'group_ids': sorted(current_tag.group_ids)
     }
+
+
+def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None,
+                labile_modifications=None, fixed_modifications=None, intervals=None,
+                isotopes=None, group_ids=None):
+    '''Convert a sequence plus modifiers into formatted text following the ProForma specification.
+
+    Parameters
+    ----------
+    sequence : list[tuple[str, TagBase]]
+        The primary sequence of the peptidoform/proteoform to render
+    n_term : Optional[TagBase]
+        The N-terminal modification, if any.
+    c_term : Optional[TagBase]
+        The C-terminal modification, if any.
+    unlocalized_modifications : Optional[list[TagBase]]
+        Any modifications which aren't assigned to a specific location.
+    labile_modifications : Optional[list[TagBase]]
+        Any labile modifications
+    fixed_modifications : Optional[list[ModificationRule]]
+        Any fixed modifications
+    intervals : Optional[list[TaggedInterval]]
+        A list of modified intervals, if any
+    isotopes : Optional[list[StableIsotope]]
+        Any global stable isotope labels applied
+    group_ids : Optional[list[str]]
+        Any group identifiers. This parameter is currently not used.
+
+    Returns
+    -------
+    str
+        The formatted text; every leading modifier list is written in the order it was given.
+    '''
+    primary = deque(['{0!s}[{1!s}]'.format(*p) if p[1] else p[0] for p in sequence])
+    if intervals:
+        for iv in sorted(intervals, key=lambda x: x.start):
+            primary[iv.start] = '(' + primary[iv.start]
+            primary[iv.end - 1] = '{0!s})[{1!s}]'.format(primary[iv.end - 1], iv.tag)
+    if n_term:
+        primary.appendleft("[{!s}]-".format(n_term))
+    if c_term:
+        primary.append('-[{!s}]'.format(c_term))
+    if labile_modifications:  # extendleft() prepends item by item, so pre-reverse to keep the order
+        primary.extendleft(reversed(['{{{!s}}}'.format(m) for m in labile_modifications]))
+    if unlocalized_modifications:
+        primary.appendleft("?")
+        primary.extendleft(reversed(['[{!s}]'.format(m) for m in unlocalized_modifications]))
+    if isotopes:
+        primary.extendleft(reversed(['{!s}'.format(m) for m in isotopes]))
+    if fixed_modifications:
+        primary.extendleft(reversed(['{!s}'.format(m) for m in fixed_modifications]))
+    return ''.join(primary)
+
+
+class ProForma(object):
+    '''A parsed ProForma sequence: the position list plus the dict of sequence-level modifiers.'''
+    def __init__(self, sequence, properties):
+        self.sequence = sequence
+        self.properties = properties
+
+    def __str__(self):
+        return to_proforma(self.sequence, **self.properties)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self)
+
+    def __getitem__(self, i):
+        # A slice yields a new instance sharing this one's properties; an index yields one position.
+        if isinstance(i, slice):
+            return self.__class__(self.sequence[i], self.properties)
+        return self.sequence[i]
+
+    @classmethod
+    def parse(cls, string):
+        return cls(*parse_proforma(string))
+
+    @property
+    def mass(self):
+        '''Mass of the residues, the terminal H and OH, and every modification with a usable ``mass``.'''
+        def optional_mass(tag):
+            # Missing tags and tags without a resolvable mass contribute nothing (best effort).
+            try:
+                return tag.mass
+            except (AttributeError, KeyError):
+                return 0.0
+
+        fixed_rules = {}
+        for rule in self.properties['fixed_modifications']:
+            for aa in rule.targets:
+                fixed_rules[aa] = rule.modification_tag.mass
+
+        # std_aa_mass holds residue masses (free termini excluded), so start from one water (H + OH).
+        mass = Composition("H2O").mass()
+        for position in self.sequence:
+            mass += std_aa_mass[position[0]] + fixed_rules.get(position[0], 0.0)
+            mass += optional_mass(position[1])
+        # Terminal modifications belong to the molecule too; `.get` tolerates dicts without these keys.
+        mass += optional_mass(self.properties.get('n_term'))
+        mass += optional_mass(self.properties.get('c_term'))
+        for mod in self.properties['labile_modifications']:
+            mass += mod.mass
+        for mod in self.properties['unlocalized_modifications']:
+            mass += mod.mass
+        for iv in self.properties['intervals']:
+            mass += optional_mass(iv.tag)
+        return mass
+

From d069380e077099f4869d8664eb411c076449d122 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Wed, 13 Jan 2021 20:24:03 -0500
Subject: [PATCH 06/27] More proforma parsing experiments

---
 proforma_parsing.ipynb | 906 ++++++-----------------------------------
 1 file changed, 121 insertions(+), 785 deletions(-)

diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb
index 6ca08254..dfca8348 100644
--- a/proforma_parsing.ipynb
+++ b/proforma_parsing.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 172,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -22,813 +22,105 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 173,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
-    "class PrefixSavingMeta(type):\n",
-    "    '''A subclass-registering-metaclass that provides easy\n",
-    "    lookup of subclasses by prefix attributes.\n",
-    "    '''\n",
+    "import importlib\n",
+    "from pyteomics import proforma\n",
+    "importlib.reload(proforma)\n",
     "\n",
-    "    def __new__(mcs, name, parents, attrs):\n",
-    "        new_type = type.__new__(mcs, name, parents, attrs)\n",
-    "        prefix = attrs.get(\"prefix_name\")\n",
-    "        if prefix:\n",
-    "            new_type.prefix_map[prefix.lower()] = new_type\n",
-    "        short = attrs.get(\"short_prefix\")\n",
-    "        if short:\n",
-    "            new_type.prefix_map[short.lower()] = new_type\n",
-    "        return new_type\n",
-    "    \n",
-    "    def find_by_tag(self, tag_name):\n",
-    "        if tag_name is None:\n",
-    "            raise ValueError(\"tag_name cannot be None!\")\n",
-    "        tag_name = tag_name.lower()\n",
-    "        return self.prefix_map[tag_name]"
+    "tokenize_proforma = proforma.parse_proforma\n",
+    "format_proforma = proforma.to_proforma"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 227,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [],
    "source": [
-    "class TagTypeEnum(Enum):\n",
-    "    unimod = 0\n",
-    "    psimod = 1\n",
-    "    massmod = 2\n",
-    "    generic = 3\n",
-    "    info = 4\n",
-    "    gnome = 5\n",
-    "    xlmod = 6\n",
-    "\n",
-    "    formula = 7\n",
-    "    glycan = 8\n",
-    "\n",
-    "    localization_marker = 9\n",
-    "    position_label = 10\n",
-    "    group_placeholder = 999\n",
-    "\n",
-    "\n",
-    "_sentinel = object()\n",
-    "\n",
-    "\n",
-    "@add_metaclass(PrefixSavingMeta)\n",
-    "class TagBase(object):\n",
-    "    '''A base class for all tag types.\n",
-    "\n",
-    "    Attributes\n",
-    "    ----------\n",
-    "    type: Enum\n",
-    "        An element of :class:`TagTypeEnum` saying what kind of tag this is.\n",
-    "    value: object\n",
-    "        The data stored in this tag, usually an externally controlled name\n",
-    "    extra: list\n",
-    "        Any extra tags that were nested within this tag. Usually limited to INFO\n",
-    "        tags but may be other synonymous controlled vocabulary terms.\n",
-    "    group_id: str or None\n",
-    "        A short label denoting which group, if any, this tag belongs to\n",
-    "    '''\n",
-    "    __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n",
-    "\n",
-    "    prefix_name = None\n",
-    "    short_prefix = None\n",
-    "    prefix_map = {}\n",
-    "    \n",
-    "    def __init__(self, type, value, extra=None, group_id=None):\n",
-    "        self.type = type\n",
-    "        self.value = value\n",
-    "        self.extra = extra\n",
-    "        self.group_id = group_id\n",
-    "\n",
-    "    def __str__(self):\n",
-    "        part = self._format_main()\n",
-    "        if self.extra:\n",
-    "            rest = [str(e) for e in self.extra]\n",
-    "            label = '|'.join([part] + rest)\n",
-    "        else:\n",
-    "            label = part\n",
-    "        if self.group_id:\n",
-    "            label = '%s%s' % (label, self.group_id)\n",
-    "        return '%s' % label\n",
-    "    \n",
-    "    def __repr__(self):\n",
-    "        template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n",
-    "        return template.format(self=self)\n",
-    "    \n",
-    "    def __eq__(self, other):\n",
-    "        if other is None:\n",
-    "            return False\n",
-    "        return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \\\n",
-    "            and (self.group_id == other.group_id)\n",
-    "\n",
-    "    def __ne__(self, other):\n",
-    "        return not self == other\n",
-    "\n",
-    "    def find_extra(self, label):\n",
-    "        out = []\n",
-    "        if not self.extra:\n",
-    "            return out\n",
-    "        for e in self.extra:\n",
-    "            if e.type == label:\n",
-    "                out.append(e)\n",
-    "        return out\n",
-    "\n",
-    "\n",
-    "class PositionLabelTag(TagBase):\n",
-    "    '''A tag to mark that a position is involved in a group in some way, but does\n",
-    "    not imply any specific semantics.\n",
-    "    '''\n",
-    "    __slots__ = ()\n",
-    "\n",
-    "    def __init__(self, value=None, extra=None, group_id=None):\n",
-    "        assert group_id is not None\n",
-    "        super(PositionLabelTag, self).__init__(TagTypeEnum.position_label, group_id, extra, group_id)\n",
-    "    \n",
-    "    def _format_main(self):\n",
-    "        return \"#{self.group_id}\".format(self=self)\n",
-    "\n",
-    "\n",
-    "class LocalizationMarker(TagBase):\n",
-    "    '''A tag to mark a particular localization site \n",
-    "    '''\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        assert group_id is not None\n",
-    "        super(LocalizationMarker, self).__init__(TagTypeEnum.localization_marker, float(value), extra, group_id)\n",
-    "    \n",
-    "    def _format_main(self):\n",
-    "        return \"#{self.group_id}({self.value!f})\".format(self=self)\n",
-    "\n",
-    "\n",
-    "class InformationTag(TagBase):\n",
-    "    '''A tag carrying free text describing the location\n",
-    "    '''\n",
-    "    __slots__ = ()\n",
-    "\n",
-    "    prefix_name = \"INFO\"\n",
-    "\n",
-    "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        super(InformationTag, self).__init__(TagTypeEnum.info, str(value), extra, group_id)\n",
-    "\n",
-    "    def _format_main(self):\n",
-    "        return str(self.value)\n",
-    "\n",
-    "\n",
-    "class MassModification(TagBase):\n",
-    "    '''A modification defined purely by a signed mass shift in Daltons.\n",
-    "\n",
-    "    The value of a :class:`MassModification` is always a :class:`float`\n",
-    "    '''\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        super(MassModification, self).__init__(TagTypeEnum.massmod, float(value), extra, group_id)\n",
-    "    \n",
-    "    def _format_main(self):\n",
-    "        return '%0.4f' % self.value\n",
-    "\n",
-    "\n",
-    "    \n",
-    "class ModificationBase(TagBase):\n",
-    "    '''A base class for all modification tags with marked prefixes.\n",
-    "    '''\n",
-    "\n",
-    "    _tag_type = None\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        super(ModificationBase, self).__init__(\n",
-    "            self._tag_type, value, extra, group_id)\n",
-    "\n",
-    "    def _format_main(self):\n",
-    "        return \"{self.prefix_name}:{self.value}\".format(self=self)\n",
-    "    \n",
-    "    def resolve(self):\n",
-    "        '''Find the term and return it's properties\n",
-    "        '''\n",
-    "        raise NotImplementedError()\n",
-    "\n",
-    "\n",
-    "class FormulaModification(ModificationBase):\n",
-    "    prefix_name = \"Formula\"\n",
-    "\n",
-    "    _tag_type = TagTypeEnum.formula\n",
-    "\n",
-    "    def resolve(self):\n",
-    "        # The handling of fixed isotopes is wrong here as Pyteomics uses a different\n",
-    "        # convention.\n",
-    "        from pyteomics.mass import Composition\n",
-    "        composition = Composition(formula=''.join(self.value.split(\" \")))\n",
-    "        return {\n",
-    "            \"mass\": composition.mass(),\n",
-    "            \"composition\": composition\n",
-    "        }\n",
-    "\n",
-    "\n",
-    "class GlycanModification(ModificationBase):\n",
-    "    prefix_name = \"Glycan\"\n",
-    "\n",
-    "    _tag_type = TagTypeEnum.glycan\n",
-    "\n",
-    "    \n",
-    "class GenericModification(TagBase):\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    def __init__(self, value, extra=None, group_id=None):\n",
-    "        super(GenericModification, self).__init__(TagTypeEnum.generic, value, extra, group_id)\n",
-    "    \n",
-    "    def _format_main(self):\n",
-    "        return self.value\n",
-    "\n",
-    "    def resolve(self):\n",
-    "        '''Find the term, searching through all available vocabularies and\n",
-    "        return the first match's properties\n",
-    "        '''\n",
-    "        raise NotImplementedError()\n",
-    "\n",
-    "\n",
-    "class UnimodModification(ModificationBase):\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    prefix_name = \"UNIMOD\"\n",
-    "    short_prefix = \"U\"\n",
-    "    _tag_type = TagTypeEnum.unimod\n",
-    "\n",
-    "\n",
-    "class PSIModModification(ModificationBase):\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    prefix_name = \"MOD\"\n",
-    "    short_prefix = 'M'\n",
-    "    _tag_type = TagTypeEnum.psimod\n",
-    "\n",
-    "\n",
-    "class GNOmeModification(ModificationBase):\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    prefix_name = \"GNO\"\n",
-    "    # short_prefix = 'G'\n",
-    "    _tag_type = TagTypeEnum.gnome\n",
-    "\n",
-    "    \n",
-    "class XLMODModification(ModificationBase):\n",
-    "    __slots__ = ()\n",
-    "    \n",
-    "    prefix_name = \"XLMOD\"\n",
-    "    # short_prefix = 'XL'\n",
-    "    _tag_type = TagTypeEnum.xlmod\n",
-    "\n",
-    "\n",
-    "def split_tags(tokens):\n",
-    "    '''Split a token array into discrete sets of tag\n",
-    "    tokens.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    tokens: list\n",
-    "        The characters of the tag token buffer\n",
-    "    \n",
-    "    Returns\n",
-    "    -------\n",
-    "    list of list:\n",
-    "        The tokens for each contained tag\n",
-    "    '''\n",
-    "    starts = [0]\n",
-    "    ends = []\n",
-    "    for i, c in enumerate(tokens):\n",
-    "        if c == '|':\n",
-    "            ends.append(i)\n",
-    "            starts.append(i + 1)\n",
-    "        elif (i != 0 and c == '#'):\n",
-    "            ends.append(i)\n",
-    "            starts.append(i)\n",
-    "    ends.append(len(tokens))\n",
-    "    out = []\n",
-    "    for i, start in enumerate(starts):\n",
-    "        end = ends[i]\n",
-    "        out.append(tokens[start:end])\n",
-    "    return out\n",
-    "\n",
-    "\n",
-    "def find_prefix(tokens):\n",
-    "    '''Find the prefix, if any of the tag defined by `tokens`\n",
-    "    delimited by \":\".\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    tokens: list\n",
-    "        The tag tokens to search\n",
-    "    \n",
-    "    Returns\n",
-    "    -------\n",
-    "    prefix: str or None\n",
-    "        The prefix string, if found\n",
-    "    rest: str\n",
-    "        The rest of the tokens, merged as a string\n",
-    "    '''\n",
-    "    for i, c in enumerate(tokens):\n",
-    "        if c == ':':\n",
-    "            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n",
-    "    return None, tokens\n",
-    "\n",
-    "def process_marker(tokens):\n",
-    "    '''Process a marker, which is a tag whose value starts with #.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    tokens: list\n",
-    "        The tag tokens to parse\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    PositionLabelTag or LocalizationMarker\n",
-    "    '''\n",
-    "    if tokens[1:3] == 'XL':\n",
-    "        return PositionLabelTag(None, group_id=''.join(tokens))\n",
-    "    else:\n",
-    "        group_id = None\n",
-    "        value = None\n",
-    "        for i, c in  enumerate(tokens):\n",
-    "            if c == '(':\n",
-    "                group_id = ''.join(tokens[:i])\n",
-    "                if tokens[-1] != ')':\n",
-    "                    raise Exception(\"Localization marker with score missing closing parenthesis\")\n",
-    "                value = float(''.join(tokens[i + 1:-1]))\n",
-    "                return LocalizationMarker(value, group_id=group_id)\n",
-    "        else:\n",
-    "            group_id = ''.join(tokens)\n",
-    "            return PositionLabelTag(group_id=group_id)\n",
-    "        \n",
-    "\n",
-    "\n",
-    "def process_tag_tokens(tokens):\n",
-    "    '''Convert a tag token buffer into a parsed :class:`TagBase` instance\n",
-    "    of the appropriate sub-type with zero or more sub-tags.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    tokens: list\n",
-    "        The tokens to parse\n",
-    "    \n",
-    "    Returns\n",
-    "    -------\n",
-    "    TagBase:\n",
-    "        The parsed tag\n",
-    "    '''\n",
-    "    parts = split_tags(tokens)\n",
-    "    main_tag = parts[0]\n",
-    "    if main_tag[0] in ('+', '-'):\n",
-    "        main_tag = ''.join(main_tag)\n",
-    "        main_tag = MassModification(main_tag)\n",
-    "    elif main_tag[0] == '#':\n",
-    "        main_tag = process_marker(main_tag)\n",
-    "    else:\n",
-    "        prefix, value = find_prefix(main_tag)\n",
-    "        if prefix is None:\n",
-    "            main_tag = GenericModification(''.join(value))\n",
-    "        else:\n",
-    "            tag_type = TagBase.find_by_tag(prefix)\n",
-    "            main_tag = tag_type(value)\n",
-    "    if len(parts) > 1:\n",
-    "        extras = []\n",
-    "        for part in parts[1:]:\n",
-    "            prefix, value = find_prefix(part)\n",
-    "            if prefix is None:\n",
-    "                if value[0] == \"#\":\n",
-    "                    marker = process_marker(value)\n",
-    "                    if isinstance(marker, PositionLabelTag):\n",
-    "                        main_tag.group_id = ''.join(value)\n",
-    "                    else:\n",
-    "                        main_tag.group_id = marker.group_id\n",
-    "                        extras.append(marker)\n",
-    "                else:\n",
-    "                    extras.append(GenericModification(''.join(value)))\n",
-    "            else:\n",
-    "                tag_type = TagBase.find_by_tag(prefix)\n",
-    "                extras.append(tag_type(value))\n",
-    "        main_tag.extra = extras\n",
-    "    return main_tag"
+    "seq, props = proforma.parse_proforma(\"{Glycan:Hex 1 HexNAc 2 NeuAc 1}STYGIAN\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 228,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
-    "class ModificationRule(object):\n",
-    "    '''Define a fixed modification rule which dictates a modification tag is\n",
-    "    always applied at one or more amino acid residues.\n",
-    "\n",
-    "    Attributes\n",
-    "    ----------\n",
-    "    modification_tag: TagBase\n",
-    "        The modification to apply\n",
-    "    targets: list\n",
-    "        The list of amino acids this applies to\n",
-    "    '''\n",
-    "    __slots__ = ('modification_tag', 'targets')\n",
-    "\n",
-    "    def __init__(self, modification_tag, targets=None):\n",
-    "        self.modification_tag = modification_tag\n",
-    "        self.targets = targets\n",
-    "\n",
-    "    def __eq__(self, other):\n",
-    "        if other is None:\n",
-    "            return False\n",
-    "        return self.modification_tag == other.modification_tag and self.targets == other.targets\n",
-    "\n",
-    "    def __ne__(self, other):\n",
-    "        return not self == other\n",
-    "\n",
-    "    def __str__(self):\n",
-    "        targets = ','.join(self.targets)\n",
-    "        return \"<{self.modification_tag}@{targets}>\".format(self=self, targets=targets)\n",
-    "\n",
-    "    def __repr__(self):\n",
-    "        return \"{self.__class__.__name__}({self.modification_tag!r}, {self.targets})\".format(self=self)\n",
-    "\n",
-    "\n",
-    "class StableIsotope(object):\n",
-    "    '''Define a fixed isotope that is applied globally to all amino acids.\n",
-    "\n",
-    "    Attributes\n",
-    "    ----------\n",
-    "    isotope: str\n",
-    "        The stable isotope string, of the form [<isotope-number>]<element> or a special\n",
-    "        isotopoform's name.\n",
-    "    '''\n",
-    "    __slots__ = ('isotope', )\n",
-    "\n",
-    "    def __init__(self, isotope):\n",
-    "        self.isotope = isotope\n",
-    "\n",
-    "    def __eq__(self, other):\n",
-    "        if other is None:\n",
-    "            return False\n",
-    "        return self.isotope == other.isotope\n",
-    "\n",
-    "    def __ne__(self, other):\n",
-    "        return not self == other\n",
-    "\n",
-    "    def __str__(self):\n",
-    "        return \"<{self.isotope}>\".format(self=self)\n",
-    "\n",
-    "    def __repr__(self):\n",
-    "        return \"{self.__class__.__name__}({self.isotope})\".format(self=self)\n",
-    "\n",
-    "\n",
-    "class TaggedInterval(object):\n",
-    "    '''Define a fixed interval over the associated sequence which contains the localization\n",
-    "    of the associated tag.\n",
-    "\n",
-    "    Attributes\n",
-    "    ----------\n",
-    "    start: int\n",
-    "        The starting position (inclusive) of the interval along the primary sequence\n",
-    "    end: int\n",
-    "        The ending position (exclusive) of the interval along the primary sequence\n",
-    "    tag: TagBase\n",
-    "        The tag being localized\n",
-    "    '''\n",
-    "    __slots__ = ('start', 'end', 'tag')\n",
-    "\n",
-    "    def __init__(self, start, end=None, tag=None):\n",
-    "        self.start = start\n",
-    "        self.end = end\n",
-    "        self.tag = tag\n",
-    "    \n",
-    "    def __eq__(self, other):\n",
-    "        if other is None:\n",
-    "            return False\n",
-    "        return self.start == other.start and self.end == other.end and self.tag == other.tag\n",
-    "\n",
-    "    def __ne__(self, other):\n",
-    "        return not self == other\n",
-    "\n",
-    "    def __str__(self):\n",
-    "        return \"({self.start}-{self.end}){self.tag!r}\".format(self=self)\n",
-    "\n",
-    "    def __repr__(self):\n",
-    "        return \"{self.__class__.__name__}({self.start}, {self.end}, {self.tag})\".format(self=self)\n"
+    "p = proforma.ProForma.parse(\"{Glycan:Hex1HexNAc2NeuAc1#g1}S[#g1]T[#g1]YGIANS[#g1]EQ\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 229,
+   "execution_count": 48,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "{'n_term': None,\n",
+       " 'c_term': None,\n",
+       " 'unlocalized_modifications': [],\n",
+       " 'labile_modifications': [GlycanModification('Hex1HexNAc2NeuAc1', None, None)],\n",
+       " 'fixed_modifications': [],\n",
+       " 'intervals': [],\n",
+       " 'isotopes': [],\n",
+       " 'group_ids': []}"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 48
+    }
+   ],
    "source": [
-    "class TagParser(object):\n",
-    "    '''A parser which accumulates tokens until it is asked to parse them into\n",
-    "    :class:`TagBase` instances.\n",
-    "\n",
-    "    Implements a subset of the Sequence protocol.\n",
-    "\n",
-    "    Attributes\n",
-    "    ----------\n",
-    "    buffer: list\n",
-    "        The list of tokens accumulated since the last parsing.\n",
-    "    group_ids: set\n",
-    "        The set of all group IDs that have been produced so far.\n",
-    "    '''\n",
-    "\n",
-    "    def __init__(self, initial=None, group_ids=None):\n",
-    "        if initial:\n",
-    "            self.buffer = list(initial)\n",
-    "        else:\n",
-    "            self.buffer = []\n",
-    "        if group_ids:\n",
-    "            self.group_ids = set(group_ids)\n",
-    "        else:\n",
-    "            self.group_ids = set()\n",
-    "    \n",
-    "    def append(self, c):\n",
-    "        '''Append a new character to the buffer.\n",
-    "\n",
-    "        Parameters\n",
-    "        ----------\n",
-    "        c: str\n",
-    "            The character appended\n",
-    "        '''\n",
-    "        self.buffer.append(c)\n",
-    "    \n",
-    "    def reset(self):\n",
-    "        '''Discard the content of the current buffer.\n",
-    "        '''\n",
-    "        self.buffer = []\n",
-    "    \n",
-    "    def __bool__(self):\n",
-    "        return bool(self.buffer)\n",
-    "    \n",
-    "    def __iter__(self):\n",
-    "        return iter(self.buffer)\n",
-    "\n",
-    "    def __getitem__(self, i):\n",
-    "        return self.buffer[i]\n",
-    "    \n",
-    "    def __len__(self):\n",
-    "        return len(self.buffer)\n",
-    "\n",
-    "    def process(self):\n",
-    "        '''Parse the content of the internal buffer, clear the buffer,\n",
-    "        and return the parsed tag.\n",
-    "\n",
-    "        Returns\n",
-    "        -------\n",
-    "        TagBase\n",
-    "        '''\n",
-    "        tag = process_tag_tokens(self.buffer)\n",
-    "        if tag.group_id:\n",
-    "            self.group_ids.add(tag.group_id)\n",
-    "        self.reset()\n",
-    "        return tag"
+    "p.properties"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 25
+    }
+   ],
    "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 230,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "class ParserStateEnum(Enum):\n",
-    "    before_sequence = 0\n",
-    "    tag_before_sequence = 1\n",
-    "    global_tag = 2\n",
-    "    fixed_spec = 3\n",
-    "    labile_tag = 4\n",
-    "    sequence = 5\n",
-    "    tag_in_sequence = 6\n",
-    "    interval_tag = 7\n",
-    "    tag_after_sequence = 8\n",
-    "    stable_isotope = 9\n",
-    "\n",
-    "    done = 999\n",
-    "\n",
-    "\n",
-    "BEFORE = ParserStateEnum.before_sequence\n",
-    "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n",
-    "FIXED = ParserStateEnum.fixed_spec\n",
-    "GLOBAL = ParserStateEnum.global_tag\n",
-    "ISOTOPE = ParserStateEnum.stable_isotope\n",
-    "LABILE = ParserStateEnum.labile_tag\n",
-    "SEQ = ParserStateEnum.sequence\n",
-    "TAG = ParserStateEnum.tag_in_sequence\n",
-    "INTERVAL_TAG = ParserStateEnum.interval_tag\n",
-    "TAG_AFTER = ParserStateEnum.tag_after_sequence\n",
-    "DONE = ParserStateEnum.done\n",
-    "\n",
-    "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n",
-    "\n",
-    "def parse_proforma(sequence):\n",
-    "    '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a\n",
-    "    mapping of sequence-spanning modifiers.\n",
-    "\n",
-    "    .. note::\n",
-    "        This is a state machine parser, but with certain sub-state paths\n",
-    "        unrolled to avoid an explosion of formal intermediary states.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    sequence: str\n",
-    "        The sequence to parse\n",
-    "    \n",
-    "    Returns\n",
-    "    -------\n",
-    "    parsed_sequence: list\n",
-    "        The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence\n",
-    "    modifiers: dict\n",
-    "        A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized\n",
-    "        modifications, tagged intervals, and group IDs\n",
-    "    '''\n",
-    "    labile_modifications = []\n",
-    "    fixed_modifications = []\n",
-    "    unlocalized_modifications = []\n",
-    "    intervals = []\n",
-    "    isotopes = []\n",
-    "    \n",
-    "    n_term = None\n",
-    "    c_term = None\n",
-    "    \n",
-    "    i = 0\n",
-    "    n = len(sequence)\n",
-    "    \n",
-    "    positions = []\n",
-    "    state = BEFORE\n",
-    "    depth = 0\n",
-    "    \n",
-    "    current_aa = None\n",
-    "    current_tag = TagParser()\n",
-    "    current_interval = None\n",
-    "    \n",
-    "    while i < n:\n",
-    "        c = sequence[i]\n",
-    "        i += 1\n",
-    "        if state == BEFORE:\n",
-    "            if c == '[':\n",
-    "                state = TAG_BEFORE\n",
-    "                depth = 1\n",
-    "            elif c == '{':\n",
-    "                state = LABILE\n",
-    "                depth = 1\n",
-    "            elif c == '<':\n",
-    "                state = FIXED\n",
-    "            elif c in VALID_AA:\n",
-    "                current_aa = c\n",
-    "                state = SEQ\n",
-    "            else:\n",
-    "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "        elif state == SEQ:\n",
-    "            if c in VALID_AA:\n",
-    "                positions.append((current_aa, current_tag.process() if current_tag else None))\n",
-    "                current_aa = c\n",
-    "            elif c == '[':\n",
-    "                state = TAG\n",
-    "                depth = 1\n",
-    "            elif c == '(':\n",
-    "                current_interval = TaggedInterval(len(positions) + 1)\n",
-    "            elif c == ')':\n",
-    "                if current_interval is None:\n",
-    "                    raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "                else:\n",
-    "                    current_interval.end = len(positions) + 1\n",
-    "                    if i >= n or sequence[i] != '[':\n",
-    "                        raise Exception(\"Missing Interval Tag\")\n",
-    "                    i += 1\n",
-    "                    depth = 1\n",
-    "                    state = INTERVAL_TAG\n",
-    "            elif c == '-':\n",
-    "                state = TAG_AFTER\n",
-    "                if i >= n or sequence[i] != '[':\n",
-    "                    raise Exception(\"Missing Interval Tag\")\n",
-    "                i += 1\n",
-    "                depth = 1                \n",
-    "            else:\n",
-    "                raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:\n",
-    "            if c == '[':\n",
-    "                depth += 1\n",
-    "            elif c == ']':\n",
-    "                depth -= 1\n",
-    "                if depth <= 0:\n",
-    "                    depth = 0\n",
-    "                    if state == TAG: \n",
-    "                        state = SEQ\n",
-    "                    elif state == TAG_BEFORE:\n",
-    "                        if i < n:\n",
-    "                            cnext = sequence[i]\n",
-    "                            if cnext == '?':\n",
-    "                                unlocalized_modifications.append(current_tag.process())\n",
-    "                                i += 1\n",
-    "                            elif cnext == '-':\n",
-    "                                n_term = current_tag.process()\n",
-    "                                i += 1\n",
-    "                            else:\n",
-    "                                i += 1\n",
-    "                                raise Exception(\"Error In State {state}, unexpected {cnext} found at index {i}\".format(**locals()))\n",
-    "\n",
-    "                        state = BEFORE\n",
-    "                    elif state == TAG_AFTER:\n",
-    "                        c_term = current_tag.process()\n",
-    "                        state = DONE\n",
-    "                    elif state == GLOBAL:\n",
-    "                        # Gobble the rest of the global tag inline to avoid spawning\n",
-    "                        # a whole new state.\n",
-    "                        if i < n:\n",
-    "                            c = sequence[i]\n",
-    "                            i += 1\n",
-    "                            if c != '@':\n",
-    "                                raise Exception(\n",
-    "                                    (\"Error In State {state}, fixed modification detected without \"\n",
-    "                                    \"target amino acids found at index {i}\").format(**locals()))\n",
-    "                            end = 0\n",
-    "                            targets = []\n",
-    "                            while i < n:\n",
-    "                                c = sequence[i]\n",
-    "                                i += 1\n",
-    "                                if c in VALID_AA:\n",
-    "                                    targets.append(c)\n",
-    "                                elif c == ',':\n",
-    "                                    pass\n",
-    "                                elif '>':\n",
-    "                                    break\n",
-    "                            else:\n",
-    "                                raise Exception(\n",
-    "                                    (\"Error In State {state}, unclosed fixed modification rule\").format(**locals()))\n",
-    "\n",
-    "                        fixed_modifications.append(\n",
-    "                            ModificationRule(current_tag.process(), targets))\n",
-    "                        state = BEFORE\n",
-    "            else:\n",
-    "                current_tag.append(c)\n",
-    "        elif state == FIXED:\n",
-    "            if c == '[':\n",
-    "                state = GLOBAL\n",
-    "            else:\n",
-    "                state = ISOTOPE\n",
-    "                current_tag.append(c)\n",
-    "        elif state == ISOTOPE:\n",
-    "            if c != '>':\n",
-    "                current_tag.append(c)\n",
-    "            else:\n",
-    "                isotopes.append(StableIsotope(''.join(current_tag)))\n",
-    "                current_tag.reset()\n",
-    "                state = BEFORE\n",
-    "        elif state == LABILE:\n",
-    "            if c == '{':\n",
-    "                depth += 1\n",
-    "            elif c == '}':\n",
-    "                depth -= 1\n",
-    "                if depth <= 0:\n",
-    "                    depth = 0\n",
-    "                    labile_modifications.append(current_tag.process())\n",
-    "                    state = BEFORE\n",
-    "            else:\n",
-    "                current_tag.append(c)\n",
-    "        elif state == INTERVAL_TAG:\n",
-    "            if c == '[':\n",
-    "                depth += 1\n",
-    "            elif c == ']':\n",
-    "                depth -= 1\n",
-    "                if depth <= 0:\n",
-    "                    depth = 0\n",
-    "                    current_interval.tag = current_tag.process()\n",
-    "                    intervals.append(current_interval)\n",
-    "                    current_interval = None\n",
-    "                    state = SEQ\n",
-    "            else:\n",
-    "                current_tag.append(c)\n",
-    "        else:\n",
-    "            raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n",
-    "    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):\n",
-    "        raise Exception(\"Error In State {state}, unclosed group reached end of string!\".format(**locals()))\n",
-    "    if current_aa:\n",
-    "        positions.append((current_aa, current_tag.process() if current_tag else None))\n",
-    "    return positions, {\n",
-    "        'n_term': n_term,\n",
-    "        'c_term': c_term,\n",
-    "        'unlocalized_modifications': unlocalized_modifications,\n",
-    "        'labile_modifications': labile_modifications,\n",
-    "        'fixed_modifications': fixed_modifications,\n",
-    "        'intervals': intervals,\n",
-    "        'isotopes': isotopes,\n",
-    "        'group_ids': list(current_tag.group_ids)\n",
-    "    }"
-   ]
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "{'mass': 841.2962353162999,\n",
+       " 'composition': Composition({'C': 29, 'H': 51, 'O': 22, 'N': 3}),\n",
+       " 'name': 'Hex 1 HexNAc 2 NeuAc 1',\n",
+       " 'monosaccharides': BasicComposition({'Hex': 1, 'HexNAc': 2, 'NeuAc': 1})}"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 12
+    }
+   ],
+   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 231,
+   "execution_count": 120,
    "metadata": {},
    "outputs": [
     {
@@ -836,7 +128,7 @@
      "data": {
       "text/plain": [
        "([('S', None),\n",
-       "  ('T', UnimodModification('Ox', None, None)),\n",
+       "  ('T', UnimodModification('Oxidation', None, None)),\n",
        "  ('E', None),\n",
        "  ('P', None),\n",
        "  ('P', None),\n",
@@ -845,26 +137,66 @@
        "  ('G', None)],\n",
        " {'n_term': GenericModification('Hex', None, None),\n",
        "  'c_term': None,\n",
-       "  'unlocalized_modifications': [GenericModification('Bar', None, None)],\n",
-       "  'labile_modifications': [GenericModification('Foo', None, None)],\n",
+       "  'unlocalized_modifications': [GenericModification('Hydroxyl', None, None)],\n",
+       "  'labile_modifications': [GenericModification('HexNAc', None, None)],\n",
        "  'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n",
-       "  'intervals': [TaggedInterval(2, 5, 18.0000)],\n",
+       "  'intervals': [TaggedInterval(2, 5, +18.15)],\n",
        "  'isotopes': [StableIsotope(13C)],\n",
        "  'group_ids': []})"
       ]
      },
      "metadata": {},
-     "execution_count": 231
+     "execution_count": 120
     }
    ],
    "source": [
-    "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Bar]?{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n",
+    "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[U:Oxidation](EPP)[+18.15]ING\")\n",
     "seq, fields"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 232,
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "'<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING'"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 121
+    }
+   ],
+   "source": [
+    "format_proforma(seq, **fields)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "'1010100.00001'"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 119
+    }
+   ],
+   "source": [
+    "'1010100.00001'.rstrip('0').rstrip('.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
    "metadata": {},
    "outputs": [
     {
@@ -889,7 +221,7 @@
       ]
      },
      "metadata": {},
-     "execution_count": 232
+     "execution_count": 67
     }
    ],
    "source": [
@@ -899,7 +231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 233,
+   "execution_count": 69,
    "metadata": {},
    "outputs": [
     {
@@ -928,11 +260,11 @@
       ]
      },
      "metadata": {},
-     "execution_count": 233
+     "execution_count": 69
     }
    ],
    "source": [
-    "parse_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")"
+    "tokenize_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")"
    ]
   },
   {
@@ -1023,9 +355,13 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.8.5 64-bit ('py38': conda)",
+   "metadata": {
+    "interpreter": {
+     "hash": "486495e7f81c8f11fe15f00929ebabe524f3fb730012655e6ba0fbdcd165e71e"
+    }
+   }
   },
   "language_info": {
    "codemirror_mode": {

From ed2cacf1949096ccdf1af4ffcd087e5f16997972 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Tue, 19 Jan 2021 08:46:54 -0500
Subject: [PATCH 07/27] Fix out-of-order monosaccharide formulae; Add support
 for the Obs tag; Flesh out the Generic modification resolver

---
 pyteomics/proforma.py | 94 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 20 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 6b4b39c1..e15b1a29 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -36,12 +36,11 @@
     # Python 2 doesn't have a builtin Enum type
     Enum = object
 
-from six import add_metaclass
 
 from pyteomics import parser
-from pyteomics.mass import Composition, std_aa_mass
+from pyteomics.mass import Composition, std_aa_mass, Unimod
 from pyteomics.auxiliary import PyteomicsError, BasicComposition
-from pyteomics.mass import Unimod
+from pyteomics.auxiliary.utils import add_metaclass
 
 # To eventually be implemented with pyteomics port?
 try:
@@ -56,6 +55,10 @@ def _needs_psims(name):
     obo_cache = None
 
 
+std_aa_mass = std_aa_mass.copy()
+std_aa_mass['X'] = 0
+
+
 class ProFormaError(PyteomicsError):
     def __init__(self, message, index=None, parser_state=None, **kwargs):
         super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state)
@@ -142,7 +145,7 @@ def __str__(self):
         else:
             label = part
         if self.group_id:
-            label = '%s|%s' % (label, self.group_id)
+            label = '%s%s' % (label, self.group_id)
         return '%s' % label
 
     def __repr__(self):
@@ -167,6 +170,10 @@ def find_extra(self, label):
                 out.append(e)
         return out
 
+    @classmethod
+    def parse(cls, buffer):
+        return process_tag_tokens(buffer)
+
 
 class GroupLabelBase(TagBase):
     __slots__ = ()
@@ -189,8 +196,9 @@ class PositionLabelTag(GroupLabelBase):
 
     def __init__(self, value=None, extra=None, group_id=None):
         assert group_id is not None
+        value = group_id
         super(PositionLabelTag, self).__init__(
-            TagTypeEnum.position_label, group_id, extra, group_id)
+            TagTypeEnum.position_label, value, extra, group_id)
 
     def _format_main(self):
         return "{self.group_id}".format(self=self)
@@ -230,17 +238,24 @@ class MassModification(TagBase):
 
     The value of a :class:`MassModification` is always a :class:`float`
     '''
-    __slots__ = ()
+    __slots__ = ('_significant_figures', )
+
+    prefix_name = "Obs"
 
     def __init__(self, value, extra=None, group_id=None):
+        if isinstance(value, str):
+            sigfigs = len(value.split('.')[-1].rstrip('0'))
+        else:
+            sigfigs = 4
+        self._significant_figures = sigfigs
         super(MassModification, self).__init__(
             TagTypeEnum.massmod, float(value), extra, group_id)
 
     def _format_main(self):
         if self.value >= 0:
-            return ('+%0.4g' % self.value).rstrip('0').rstrip('.')
+            return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
         else:
-            return ('%0.4g' % self.value).rstrip('0').rstrip('.')
+            return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
 
     @property
     def mass(self):
@@ -248,7 +263,7 @@ def mass(self):
 
 
 class ModificationResolver(object):
-    def __init__(self, name, *args, **kwargs):
+    def __init__(self, name, **kwargs):
         self.name = name
         self._database = None
 
@@ -269,8 +284,8 @@ def __call__(self, name=None, id=None, **kwargs):
 
 
 class UnimodResolver(ModificationResolver):
-    def __init__(self, *args, **kwargs):
-        super(UnimodResolver, self).__init__("unimod", *args, **kwargs)
+    def __init__(self, **kwargs):
+        super(UnimodResolver, self).__init__("unimod", **kwargs)
         self._database = kwargs.get("database")
 
     def load_database(self):
@@ -299,8 +314,8 @@ def resolve(self, name=None, id=None, **kwargs):
 
 
 class PSIModResolver(ModificationResolver):
-    def __init__(self, *args, **kwargs):
-        super(PSIModResolver, self).__init__('psimod', *args, **kwargs)
+    def __init__(self, **kwargs):
+        super(PSIModResolver, self).__init__('psimod', **kwargs)
         self._database = kwargs.get("database")
 
     def load_database(self):
@@ -325,8 +340,8 @@ def resolve(self, name=None, id=None, **kwargs):
 
 
 class XLMODResolver(ModificationResolver):
-    def __init__(self, *args, **kwargs):
-        super(XLMODResolver, self).__init__('xlmod', *args, **kwargs)
+    def __init__(self, **kwargs):
+        super(XLMODResolver, self).__init__('xlmod', **kwargs)
         self._database = kwargs.get("database")
 
     def load_database(self):
@@ -353,16 +368,43 @@ def resolve(self, name=None, id=None, **kwargs):
             'provider': self.name
         }
 
-
+# TODO: Implement resolve walking up the graph to get the mass. Can't really
+# get any more information without glypy/glyspace interaction
 class GNOResolver(ModificationResolver):
-    def __init__(self, *args, **kwargs):
-        super(GNOResolver, self).__init__('gnome', *args, **kwargs)
+    def __init__(self, **kwargs):
+        super(GNOResolver, self).__init__('gnome', **kwargs)
         self._database = kwargs.get("database")
 
     def load_database(self):
         return load_gno()
 
 
+class GenericResolver(ModificationResolver):
+
+    def __init__(self, resolvers, **kwargs):
+        super(GenericResolver, self).__init__('generic', **kwargs)
+        self.resolvers = list(resolvers)
+
+    def load_database(self):
+        return None
+
+    def resolve(self, name=None, id=None, **kwargs):
+        defn = None
+        for resolver in self.resolvers:
+            try:
+                defn = resolver(name=name, id=id, **kwargs)
+            except (KeyError):
+                continue
+        if defn is None:
+            if name is None:
+                raise KeyError(id)
+            elif id is None:
+                raise KeyError(name)
+            else:
+                raise ValueError("Must provide one of `name` or `id`")
+        return defn
+
+
 class ModificationBase(TagBase):
     '''A base class for all modification tags with marked prefixes.
     '''
@@ -451,8 +493,8 @@ class GlycanModification(ModificationBase):
 
     valid_monosaccharides = {
         "Hex": (162.0528, Composition("C6H10O5")),
-        "HexNAc": (203.0793, Composition("C6H13N1O5")),
-        "HexS": (242.009, Composition("C8H10O8S1")),
+        "HexNAc": (203.0793, Composition("C8H13N1O5")),
+        "HexS": (242.009, Composition("C6H10O8S1")),
         "HexP": (242.0191, Composition("C6H11O8P1")),
         "HexNAcS": (283.0361, Composition("C8H13N1O8S1")),
         "dHex": (146.0579, Composition("C6H10O4")),
@@ -464,6 +506,10 @@ class GlycanModification(ModificationBase):
 
     tokenizer = re.compile(r"([A-Za-z]+)\s*(\d*)\s*")
 
+    @property
+    def monosaccharides(self):
+        return self.definition.get('monosaccharides')
+
     def resolve(self):
         composite = BasicComposition()
         for tok, cnt in self.tokenizer.findall(self.value):
@@ -511,6 +557,8 @@ class PSIModModification(ModificationBase):
 class GNOmeModification(ModificationBase):
     __slots__ = ()
 
+    resolver = GNOResolver()
+
     prefix_name = "GNO"
     # short_prefix = 'G'
     _tag_type = TagTypeEnum.gnome
@@ -529,6 +577,12 @@ class XLMODModification(ModificationBase):
 class GenericModification(ModificationBase):
     __slots__ = ()
     _tag_type = TagTypeEnum.generic
+    resolver = GenericResolver([
+        UnimodModification.resolver,
+        PSIModModification.resolver,
+        XLMODModification.resolver,
+        GNOmeModification.resolver,
+    ])
 
     def __init__(self, value, extra=None, group_id=None):
         super(GenericModification, self).__init__(

From 527b8209f4fb5dcb6a50120a7c8d8e05472943ad Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Tue, 23 Mar 2021 13:48:12 -0400
Subject: [PATCH 08/27] Add multimod example

---
 proforma_parsing.ipynb | 44 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb
index dfca8348..f457d854 100644
--- a/proforma_parsing.ipynb
+++ b/proforma_parsing.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -22,7 +22,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,6 +34,46 @@
     "format_proforma = proforma.to_proforma"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "([('P', None),\n",
+       "  ('A', None),\n",
+       "  ('R', None),\n",
+       "  ('S', GenericModification('PhosphoPhospho', None, None)),\n",
+       "  ('E', None),\n",
+       "  ('R', None)],\n",
+       " {'n_term': None,\n",
+       "  'c_term': None,\n",
+       "  'unlocalized_modifications': [],\n",
+       "  'labile_modifications': [],\n",
+       "  'fixed_modifications': [],\n",
+       "  'intervals': [],\n",
+       "  'isotopes': [],\n",
+       "  'group_ids': []})"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 6
+    }
+   ],
+   "source": [
+    "proforma.parse_proforma(\"PARS[Phospho][Phospho]ER\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 41,

From 4afde59668e43c9f30253e3a1ee3e25e4496ac11 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 28 Mar 2021 22:48:57 -0400
Subject: [PATCH 09/27] Prepping for draft PR

---
 proforma_parsing.ipynb | 421 -----------------------------------------
 pyteomics/proforma.py  |  24 ++-
 2 files changed, 22 insertions(+), 423 deletions(-)
 delete mode 100644 proforma_parsing.ipynb

diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb
deleted file mode 100644
index f457d854..00000000
--- a/proforma_parsing.ipynb
+++ /dev/null
@@ -1,421 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import re\n",
-    "from collections import namedtuple, defaultdict\n",
-    "\n",
-    "try:\n",
-    "    from enum import Enum\n",
-    "except ImportError:\n",
-    "    # Python 2 doesn't have a builtin Enum type\n",
-    "    Enum = object\n",
-    "\n",
-    "from six import add_metaclass\n",
-    "\n",
-    "from pyteomics import parser"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import importlib\n",
-    "from pyteomics import proforma\n",
-    "importlib.reload(proforma)\n",
-    "\n",
-    "tokenize_proforma = proforma.parse_proforma\n",
-    "format_proforma = proforma.to_proforma"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('P', None),\n",
-       "  ('A', None),\n",
-       "  ('R', None),\n",
-       "  ('S', GenericModification('PhosphoPhospho', None, None)),\n",
-       "  ('E', None),\n",
-       "  ('R', None)],\n",
-       " {'n_term': None,\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [],\n",
-       "  'labile_modifications': [],\n",
-       "  'fixed_modifications': [],\n",
-       "  'intervals': [],\n",
-       "  'isotopes': [],\n",
-       "  'group_ids': []})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 6
-    }
-   ],
-   "source": [
-    "proforma.parse_proforma(\"PARS[Phospho][Phospho]ER\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "seq, props = proforma.parse_proforma(\"{Glycan:Hex 1 HexNAc 2 NeuAc 1}STYGIAN\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "p = proforma.ProForma.parse(\"{Glycan:Hex1HexNAc2NeuAc1#g1}S[#g1]T[#g1]YGIANS[#g1]EQ\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "{'n_term': None,\n",
-       " 'c_term': None,\n",
-       " 'unlocalized_modifications': [],\n",
-       " 'labile_modifications': [GlycanModification('Hex1HexNAc2NeuAc1', None, None)],\n",
-       " 'fixed_modifications': [],\n",
-       " 'intervals': [],\n",
-       " 'isotopes': [],\n",
-       " 'group_ids': []}"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 48
-    }
-   ],
-   "source": [
-    "p.properties"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 25
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "{'mass': 841.2962353162999,\n",
-       " 'composition': Composition({'C': 29, 'H': 51, 'O': 22, 'N': 3}),\n",
-       " 'name': 'Hex 1 HexNAc 2 NeuAc 1',\n",
-       " 'monosaccharides': BasicComposition({'Hex': 1, 'HexNAc': 2, 'NeuAc': 1})}"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 12
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 120,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('S', None),\n",
-       "  ('T', UnimodModification('Oxidation', None, None)),\n",
-       "  ('E', None),\n",
-       "  ('P', None),\n",
-       "  ('P', None),\n",
-       "  ('I', None),\n",
-       "  ('N', None),\n",
-       "  ('G', None)],\n",
-       " {'n_term': GenericModification('Hex', None, None),\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [GenericModification('Hydroxyl', None, None)],\n",
-       "  'labile_modifications': [GenericModification('HexNAc', None, None)],\n",
-       "  'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n",
-       "  'intervals': [TaggedInterval(2, 5, +18.15)],\n",
-       "  'isotopes': [StableIsotope(13C)],\n",
-       "  'group_ids': []})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 120
-    }
-   ],
-   "source": [
-    "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[U:Oxidation](EPP)[+18.15]ING\")\n",
-    "seq, fields"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 121,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "'<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING'"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 121
-    }
-   ],
-   "source": [
-    "format_proforma(seq, **fields)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "'1010100.00001'"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 119
-    }
-   ],
-   "source": [
-    "'1010100.00001'.rstrip('0').rstrip('.')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 67,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('S', None),\n",
-       "  ('E', None),\n",
-       "  ('P', None),\n",
-       "  ('P', None),\n",
-       "  ('I', None),\n",
-       "  ('N', None),\n",
-       "  ('G', None)],\n",
-       " {'n_term': None,\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [],\n",
-       "  'labile_modifications': [],\n",
-       "  'fixed_modifications': [],\n",
-       "  'intervals': [TaggedInterval(1, 4, 18.0000)],\n",
-       "  'isotopes': [],\n",
-       "  'group_ids': []})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 67
-    }
-   ],
-   "source": [
-    "seq, fields = tokenize_proforma(\"S(EPP)[+18]ING\")\n",
-    "seq, fields"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 69,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('E', None),\n",
-       "  ('M', GenericModification('Oxidation', None, None)),\n",
-       "  ('E', None),\n",
-       "  ('V', None),\n",
-       "  ('T', LocalizationMarker(0.01, None, '#s1')),\n",
-       "  ('S', LocalizationMarker(0.09, None, '#s1')),\n",
-       "  ('E', None),\n",
-       "  ('S', LocalizationMarker(0.9, None, '#s1')),\n",
-       "  ('P', None),\n",
-       "  ('E', None),\n",
-       "  ('K', None)],\n",
-       " {'n_term': None,\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [GenericModification('Phospho', [], '#s1')],\n",
-       "  'labile_modifications': [],\n",
-       "  'fixed_modifications': [],\n",
-       "  'intervals': [],\n",
-       "  'isotopes': [],\n",
-       "  'group_ids': ['#s1']})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 69
-    }
-   ],
-   "source": [
-    "tokenize_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 234,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('E', None),\n",
-       "  ('M', GenericModification('Oxidation', None, None)),\n",
-       "  ('E', None),\n",
-       "  ('V', None),\n",
-       "  ('T', LocalizationMarker(0.01, None, '#g1')),\n",
-       "  ('S', LocalizationMarker(0.09, None, '#g1')),\n",
-       "  ('E', None),\n",
-       "  ('S',\n",
-       "   GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n",
-       "  ('P', None),\n",
-       "  ('E', None),\n",
-       "  ('K', None)],\n",
-       " {'n_term': None,\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [],\n",
-       "  'labile_modifications': [],\n",
-       "  'fixed_modifications': [],\n",
-       "  'intervals': [],\n",
-       "  'isotopes': [],\n",
-       "  'group_ids': ['#g1']})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 234
-    }
-   ],
-   "source": [
-    "tokenize_proforma(\"EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 235,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": [
-       "([('E', None),\n",
-       "  ('M', None),\n",
-       "  ('E', None),\n",
-       "  ('V', None),\n",
-       "  ('T', LocalizationMarker(0.01, None, '#g1')),\n",
-       "  ('S', LocalizationMarker(0.09, None, '#g1')),\n",
-       "  ('E', None),\n",
-       "  ('S',\n",
-       "   GlycanModification('HexNAc 1', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n",
-       "  ('P', None),\n",
-       "  ('E', None),\n",
-       "  ('K', None)],\n",
-       " {'n_term': None,\n",
-       "  'c_term': None,\n",
-       "  'unlocalized_modifications': [],\n",
-       "  'labile_modifications': [],\n",
-       "  'fixed_modifications': [],\n",
-       "  'intervals': [],\n",
-       "  'isotopes': [],\n",
-       "  'group_ids': ['#g1']})"
-      ]
-     },
-     "metadata": {},
-     "execution_count": 235
-    }
-   ],
-   "source": [
-    "tokenize_proforma(\"EMEVT[#g1(0.01)]S[#g1(0.09)]ES[Glycan:HexNAc 1#g1(0.90)]PEK\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3.8.5 64-bit ('py38': conda)",
-   "metadata": {
-    "interpreter": {
-     "hash": "486495e7f81c8f11fe15f00929ebabe524f3fb730012655e6ba0fbdcd165e71e"
-    }
-   }
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5-final"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index e15b1a29..c0fe755d 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1,6 +1,18 @@
-'''A simple ProForma lexer
+'''
+proforma - Proteoform and Peptidoform Notation
+==============================================
+
+ProForma is a notation for defining modified amino acid sequences using
+a set of controlled vocabularies, as well as encoding uncertain or partial
+information about localization. See `ProForma specification <https://www.psidev.info/proforma>`_
+for more up-to-date information.
+
+Strictly speaking, this implementation supports ProForma v2.
+
+Data Access
+-----------
 
-The primary interface is through :func:`parse_proforma`:
+:py:func:`parse_proforma` - The primary interface for parsing ProForma strings.
 
     >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK")
         ([('E', None),
@@ -24,6 +36,14 @@
           'isotopes': [],
           'group_ids': ['#g1']})
 
+:py:func:`to_proforma` - Format a sequence and set of properties as ProForma text.
+
+
+Classes
+-------
+
+:py:class:`ProForma` - An object oriented version of the parsing and formatting code,
+coupled with minimal information about mass and position data.
 '''
 
 import re

From 34d36dbece0a870400241f20b7e16aa73a40dc5f Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 23 May 2021 23:09:56 -0400
Subject: [PATCH 10/27] Add support for multiple tags per position, add tests,
 and fix some lingering issues.

---
 pyteomics/proforma.py  | 281 +++++++++++++++++++++++++++++++++--------
 tests/test_proforma.py |  65 ++++++++++
 2 files changed, 294 insertions(+), 52 deletions(-)
 create mode 100644 tests/test_proforma.py

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index c0fe755d..49df3ed0 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -47,6 +47,7 @@
 '''
 
 import re
+import warnings
 from collections import namedtuple, defaultdict, deque
 from functools import partial
 
@@ -175,18 +176,35 @@ def __repr__(self):
     def __eq__(self, other):
         if other is None:
             return False
+        if isinstance(other, str):
+            return str(self) == other
         return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \
             and (self.group_id == other.group_id)
 
     def __ne__(self, other):
         return not self == other
 
-    def find_extra(self, label):
+    def find_tag_type(self, tag_type):
+        '''Search this tag or tag collection for elements with a particular
+        tag type and return them.
+
+        Parameters
+        ----------
+        tag_type : TagTypeEnum
+            A label from :class:`TagTypeEnum`, or an equivalent type.
+
+        Returns
+        -------
+        matches : list
+            The list of all tags in this object which match the requested tag type.
+        '''
         out = []
+        if self.type == tag_type:
+            out.append(self)
         if not self.extra:
             return out
         for e in self.extra:
-            if e.type == label:
+            if e.type == tag_type:
                 out.append(e)
         return out
 
@@ -307,15 +325,27 @@ class UnimodResolver(ModificationResolver):
     def __init__(self, **kwargs):
         super(UnimodResolver, self).__init__("unimod", **kwargs)
         self._database = kwargs.get("database")
+        self.strict = kwargs.get("strict", True)
 
     def load_database(self):
         return Unimod()
 
     def resolve(self, name=None, id=None, **kwargs):
+        strict = kwargs.get("strict", self.strict)
+        exhaustive = kwargs.get("exhaustive", True)
         if name is not None:
-            defn = self.database.by_title(name)
+            defn = self.database.by_title(name, strict=strict)
             if not defn:
-                defn = self.database.by_name(name)
+                defn = self.database.by_name(name, strict=strict)
+            if not defn and exhaustive and strict:
+                defn = self.database.by_title(name, strict=False)
+                if not defn:
+                    defn = self.database.by_name(name, strict=False)
+            if defn and isinstance(defn, list):
+                warnings.warn(
+                    "Multiple matches found for {!r} in Unimod, taking the first, {}.".format(
+                        name, defn[0]['record_id']))
+                defn = defn[0]
             if not defn:
                 raise KeyError(name)
         elif id is not None:
@@ -342,9 +372,9 @@ def load_database(self):
         return load_psimod()
 
     def resolve(self, name=None, id=None, **kwargs):
-        if name is None:
+        if name is not None:
             defn = self.database[name]
-        elif id is None:
+        elif id is not None:
             defn = self.database['MOD:{:05d}'.format(id)]
         else:
             raise ValueError("Must provide one of `name` or `id`")
@@ -368,9 +398,9 @@ def load_database(self):
         return load_psimod()
 
     def resolve(self, name=None, id=None, **kwargs):
-        if name is None:
+        if name is not None:
             defn = self.database[name]
-        elif id is None:
+        elif id is not None:
             defn = self.database['XLMOD:{:05d}'.format(id)]
         else:
             raise ValueError("Must provide one of `name` or `id`")
@@ -391,6 +421,8 @@ def resolve(self, name=None, id=None, **kwargs):
 # TODO: Implement resolve walking up the graph to get the mass. Can't really
 # get any more information without glypy/glyspace interaction
 class GNOResolver(ModificationResolver):
+    mass_pattern = re.compile(r"(\d+(?:\.\d+)?) Da")  # group 1: number preceding " Da"
+
     def __init__(self, **kwargs):
         super(GNOResolver, self).__init__('gnome', **kwargs)
         self._database = kwargs.get("database")
@@ -398,6 +430,38 @@ def __init__(self, **kwargs):
     def load_database(self):
         return load_gno()
 
+    def get_mass_from_term(self, term):
+        '''Climb `term`'s ancestry toward the root and read a "... Da" mass from an ancestor's name.'''
+        root_id = 'GNO:00000001'
+        parent = term.parent()
+        if isinstance(parent, list):
+            parent = parent[0]
+        # Stop on the ancestor whose own parent is the root term.
+        while parent.id != root_id:
+            next_parent = parent.parent()  # step from the current ancestor, not from `term`
+            if isinstance(next_parent, list):
+                next_parent = next_parent[0]
+            if next_parent.id == root_id:
+                break
+            parent = next_parent
+        match = self.mass_pattern.search(parent.name)
+        return float(match.group(1)) if match else None
+
+    def resolve(self, name=None, id=None, **kwargs):
+        '''Look up a term by `name` or `id` and return its definition record as a dict.'''
+        if name is not None:
+            term = self.database[name]
+        elif id is not None:
+            term = self.database[id]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        return {
+            "name": term.name, "id": term.id,
+            "provider": self.name,
+            "composition": None,
+            "mass": self.get_mass_from_term(term),
+        }
+
 
 class GenericResolver(ModificationResolver):
 
@@ -598,10 +662,15 @@ class GenericModification(ModificationBase):
     __slots__ = ()
     _tag_type = TagTypeEnum.generic
     resolver = GenericResolver([
-        UnimodModification.resolver,
+        # Do exact matching here first. Then default to non-strict matching as a final
+        # correction effort.
+        partial(UnimodModification.resolver, exhaustive=False),
         PSIModModification.resolver,
         XLMODModification.resolver,
         GNOmeModification.resolver,
+        # Some really common names aren't actually found in the XML exactly, so default
+        # to non-strict matching now to avoid masking other sources here.
+        partial(UnimodModification.resolver, strict=False)
     ])
 
     def __init__(self, value, extra=None, group_id=None):
@@ -862,6 +931,9 @@ def __str__(self):
     def __repr__(self):
         return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self)
 
+    def as_slice(self):
+        return slice(self.start, self.end)
+
 
 class TokenBuffer(object):
     '''A token buffer that wraps the accumulation and reset logic
@@ -876,6 +948,7 @@ class TokenBuffer(object):
     '''
     def __init__(self, initial=None):
         self.buffer = list(initial or [])
+        self.boundaries = []
 
     def append(self, c):
         '''Append a new character to the buffer.
@@ -890,7 +963,10 @@ def append(self, c):
     def reset(self):
         '''Discard the content of the current buffer.
         '''
-        self.buffer = []
+        if self.buffer:
+            self.buffer = []
+        if self.boundaries:
+            self.boundaries = []
 
     def __bool__(self):
         return bool(self.buffer)
@@ -904,11 +980,31 @@ def __getitem__(self, i):
     def __len__(self):
         return len(self.buffer)
 
+    def tokenize(self):
+        i = 0
+        pieces = []
+        for k in self.boundaries + [len(self)]:
+            piece = self.buffer[i:k]
+            i = k
+            pieces.append(piece)
+        return pieces
+
+    def _transform(self, value):
+        return value
+
     def process(self):
-        value = self.buffer
+        if self.boundaries:
+            value = [self._transform(v) for v in self.tokenize()]
+        else:
+            value = self._transform(self.buffer)
         self.reset()
         return value
 
+    def bound(self):
+        k = len(self)
+        self.boundaries.append(k)
+        return k
+
     def __call__(self):
         return self.process()
 
@@ -924,10 +1020,9 @@ class NumberParser(TokenBuffer):
     buffer: list
         The list of tokens accumulated since the last parsing.
     '''
-    def process(self):
-        value = int(''.join(self))
-        self.reset()
-        return  value
+
+    def _transform(self, value):
+        return int(''.join(value))
 
 
 class TagParser(TokenBuffer):
@@ -951,20 +1046,18 @@ def __init__(self, initial=None, group_ids=None):
         else:
             self.group_ids = set()
 
-    def process(self):
-        '''Parse the content of the internal buffer, clear the buffer,
-        and return the parsed tag.
-
-        Returns
-        -------
-        TagBase
-        '''
-        tag = process_tag_tokens(self.buffer)
+    def _transform(self, value):
+        tag = process_tag_tokens(value)
         if tag.group_id:
             self.group_ids.add(tag.group_id)
-        self.reset()
         return tag
 
+    def process(self):
+        value = super().process()
+        if not isinstance(value, list):
+            value = [value]
+        return value
+
 
 class ParserStateEnum(Enum):
     before_sequence = 0
@@ -981,6 +1074,7 @@ class ParserStateEnum(Enum):
     unlocalized_count = 11
     post_global = 12
     post_global_aa = 13
+    post_interval_tag = 14
     done = 999
 
 
@@ -998,6 +1092,7 @@ class ParserStateEnum(Enum):
 UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count
 POST_GLOBAL = ParserStateEnum.post_global
 POST_GLOBAL_AA = ParserStateEnum.post_global_aa
+POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag
 DONE = ParserStateEnum.done
 
 VALID_AA = set("QWERTYIPASDFGHKLCVNM")
@@ -1065,18 +1160,28 @@ def parse_proforma(sequence):
                     "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
         elif state == SEQ:
             if c in VALID_AA:
-                positions.append((current_aa, current_tag.process() if current_tag else None))
+                positions.append((current_aa, current_tag() if current_tag else None))
                 current_aa = c
             elif c == '[':
                 state = TAG
+                if current_tag:
+                    current_tag.bound()
                 depth = 1
             elif c == '(':
+                if current_interval is not None:
+                    raise ProFormaError(
+                        ("Error In State {state}, nested range found at index {i}. "
+                         "Nested ranges are not yet supported by ProForma.").format(
+                            **locals()), i, state)
                 current_interval = TaggedInterval(len(positions) + 1)
             elif c == ')':
+                positions.append(
+                    (current_aa, current_tag() if current_tag else None))
+                current_aa = None
                 if current_interval is None:
                     raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
                 else:
-                    current_interval.end = len(positions) + 1
+                    current_interval.end = len(positions)
                     if i >= n or sequence[i] != '[':
                         raise ProFormaError("Missing Interval Tag", i, state)
                     i += 1
@@ -1085,7 +1190,7 @@ def parse_proforma(sequence):
             elif c == '-':
                 state = TAG_AFTER
                 if i >= n or sequence[i] != '[':
-                    raise ProFormaError("Missing Interval Tag", i, state)
+                    raise ProFormaError("Missing Closing Tag", i, state)
                 i += 1
                 depth = 1
             else:
@@ -1102,7 +1207,7 @@ def parse_proforma(sequence):
                     elif state == TAG_BEFORE:
                         state = POST_TAG_BEFORE
                     elif state == TAG_AFTER:
-                        c_term = current_tag.process()
+                        c_term = current_tag()
                         state = DONE
                     elif state == GLOBAL:
                         state = POST_GLOBAL
@@ -1130,7 +1235,7 @@ def parse_proforma(sequence):
                 depth -= 1
                 if depth <= 0:
                     depth = 0
-                    labile_modifications.append(current_tag.process())
+                    labile_modifications.append(current_tag()[0])
                     state = BEFORE
             else:
                 current_tag.append(c)
@@ -1140,19 +1245,32 @@ def parse_proforma(sequence):
             elif c == ']':
                 depth -= 1
                 if depth <= 0:
+                    state = POST_INTERVAL_TAG
                     depth = 0
-                    current_interval.tag = current_tag.process()
-                    intervals.append(current_interval)
-                    current_interval = None
-                    state = SEQ
             else:
                 current_tag.append(c)
+        elif state == POST_INTERVAL_TAG:
+            if c == '[':
+                current_tag.bound()
+                state = INTERVAL_TAG
+            elif c in VALID_AA:
+                current_aa = c
+                current_interval.tag = current_tag()
+                intervals.append(current_interval)
+                current_interval = None
+                state = SEQ
+            elif c == '-':
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise ProFormaError("Missing Closing Tag", i, state)
+                i += 1
+                depth = 1
         elif state == POST_TAG_BEFORE:
             if c == '?':
-                unlocalized_modifications.append(current_tag.process())
+                unlocalized_modifications.append(current_tag()[0])
                 state = BEFORE
             elif c == '-':
-                n_term = current_tag.process()
+                n_term = current_tag()
                 state = BEFORE
             elif c == '^':
                 state = UNLOCALIZED_COUNT
@@ -1165,14 +1283,14 @@ def parse_proforma(sequence):
             elif c == '[':
                 state = TAG_BEFORE
                 depth = 1
-                tag = current_tag.process()
-                multiplicity = current_unlocalized_count.process()
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
                 for i in range(multiplicity):
                     unlocalized_modifications.append(tag)
             elif c == '?':
                 state = BEFORE
-                tag = current_tag.process()
-                multiplicity = current_unlocalized_count.process()
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
                 for i in range(multiplicity):
                     unlocalized_modifications.append(tag)
             else:
@@ -1190,7 +1308,7 @@ def parse_proforma(sequence):
                 current_aa_targets.append(c)
             elif c == '>':
                 fixed_modifications.append(
-                    ModificationRule(current_tag.process(), current_aa_targets.process()))
+                    ModificationRule(current_tag()[0], current_aa_targets()))
                 state = BEFORE
             else:
                 raise ProFormaError(
@@ -1198,7 +1316,7 @@ def parse_proforma(sequence):
         else:
             raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
     if current_aa:
-        positions.append((current_aa, current_tag.process() if current_tag else None))
+        positions.append((current_aa, current_tag() if current_tag else None))
     if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
         raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
     return positions, {
@@ -1244,15 +1362,22 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No
     -------
     str
     '''
-    primary = deque(['{0!s}[{1!s}]'.format(*p) if p[1] else p[0] for p in sequence])
+    primary = deque()
+    for aa, tags in sequence:
+        if not tags:
+            primary.append(str(aa))
+        else:
+            primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags]))
     if intervals:
         for iv in sorted(intervals, key=lambda x: x.start):
             primary[iv.start] = '(' + primary[iv.start]
-            primary[iv.end - 1] = '{0!s})[{1!s}]'.format(primary[iv.end - 1], iv.tag)
+
+            primary[iv.end - 1] = '{0!s})'.format(
+                primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tag)
     if n_term:
-        primary.appendleft("[{!s}]-".format(n_term))
+        primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
     if c_term:
-        primary.append('-[{!s}]'.format(c_term))
+        primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term))
     if labile_modifications:
         primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications])
     if unlocalized_modifications:
@@ -1278,10 +1403,23 @@ def __repr__(self):
 
     def __getitem__(self, i):
         if isinstance(i, slice):
+            props = self.properties.copy()
+
             return self.__class__(self.sequence[i], self.properties)
         else:
             return self.sequence[i]
 
+    def __eq__(self, other):
+        if isinstance(other, str):
+            return str(self) == other
+        elif other is None:
+            return False
+        else:
+            return self.sequence == other.sequence and self.properties == other.properties
+
+    def __ne__(self, other):
+        return not self == other
+
     @classmethod
     def parse(cls, string):
         return cls(*parse_proforma(string))
@@ -1301,16 +1439,29 @@ def mass(self):
             mass += std_aa_mass[aa]
             if aa in fixed_rules:
                 mass += fixed_rules[aa]
-            tag = position[1]
-            if tag:
-                try:
-                    mass += tag.mass
-                except (AttributeError, KeyError):
-                    continue
+            tags = position[1]
+            if tags:
+                for tag in tags:
+                    try:
+                        mass += tag.mass
+                    except (AttributeError, KeyError):
+                        continue
         for mod in self.properties['labile_modifications']:
             mass += mod.mass
         for mod in self.properties['unlocalized_modifications']:
             mass += mod.mass
+        if self.properties.get('n_term'):
+            for mod in self.properties['n_term']:
+                try:
+                    mass += mod.mass
+                except (AttributeError, KeyError):
+                    continue
+        if self.properties.get('c_term'):
+            for mod in self.properties['c_term']:
+                try:
+                    mass += mod.mass
+                except (AttributeError, KeyError):
+                    continue
         for iv in self.properties['intervals']:
             try:
                 mass += iv.tag.mass
@@ -1318,3 +1469,29 @@ def mass(self):
                 continue
         return mass
 
+    def find_tags_by_id(self, tag_id, include_position=True):
+        if not tag_id.startswith("#"):
+            tag_id = "#" + tag_id
+        if tag_id not in self.properties['group_ids']:
+            return []
+        matches = []
+        for i, (_token, tags) in enumerate(self.sequence):
+            if tags:
+                for tag in tags:
+                    if tag.group_id == tag_id:
+                        if include_position:
+                            matches.append((i, tag))
+                        else:
+                            matches.append(tag)
+        for iv in self.properties['intervals']:
+            if iv.tag.group_id == tag_id:
+                matches.append((iv, iv.tag) if include_position else iv.tag)
+        for ulmod in self.properties['unlocalized_modifications']:
+            if ulmod.group_id == tag_id:
+                matches.append(('unlocalized_modifications', ulmod)
+                               if include_position else ulmod)
+        for lamod in self.properties['labile_modifications']:
+            if lamod.group_id == tag_id:
+                matches.append(('labile_modifications', lamod)
+                               if include_position else lamod)
+        return matches
diff --git a/tests/test_proforma.py b/tests/test_proforma.py
new file mode 100644
index 00000000..b488c3cb
--- /dev/null
+++ b/tests/test_proforma.py
@@ -0,0 +1,65 @@
+
+from ast import parse
+from os import path
+import unittest
+import pyteomics
+pyteomics.__path__ = [path.abspath(
+    path.join(path.dirname(__file__), path.pardir, 'pyteomics'))]
+from pyteomics import proforma
+from pyteomics.proforma import (
+    ProForma, TaggedInterval, parse_proforma, MassModification,
+    ModificationRule, StableIsotope, GenericModification, to_proforma,
+    )
+
+
+class ProFormaTest(unittest.TestCase):
+    maxDiff = None
+
+    def test_complicated_short(self):
+        complicated_short = r"<[Carbamidomethyl]@C><13C>[Hydroxylation]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING"
+        tokens, properties = parse_proforma(complicated_short)
+        assert len(tokens) == 8
+        assert len(properties['n_term']) == 1
+        assert properties['n_term'][0] == 'Hex'
+        assert len(properties['intervals']) == 1
+        assert properties['intervals'][0] == TaggedInterval(2, 5, [MassModification(18.15)])
+        assert len(properties['isotopes']) == 1
+        assert properties['isotopes'][0] == StableIsotope("13C")
+        assert properties['fixed_modifications'][0] == ModificationRule(
+            GenericModification('Carbamidomethyl', None, None), ['C'])
+        assert to_proforma(tokens, **properties) == complicated_short
+        self.assertAlmostEqual(ProForma(tokens, properties).mass, 1192.498297, 3)
+
+
+    def test_ranges(self):
+        seq = "PRQT(EQC[Carbamidomethyl]FQRMS)[+19.0523]ISK"
+        parsed = proforma.ProForma.parse(seq)
+        assert str(parsed) == seq
+
+    def test_error_on_nested_range(self):
+        self.assertRaises(proforma.ProFormaError, lambda: parse_proforma(
+            "PRQT(EQ(CFQR)[Carbamidomethyl]MS)[+19.0523]ISK"))
+
+    def test_localization_scores(self):
+        seq = "EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK"
+        obj = ProForma.parse(seq)
+        tags = obj.find_tags_by_id("#g1")
+        solutions = {4: 0.01, 5: 0.09, 7: 0.9}
+        for i, tag in tags:
+            marker = tag.find_tag_type(proforma.TagTypeEnum.localization_marker)[0]
+            expected = solutions[i]
+            assert expected == marker.value
+
+    def test_multiple_info(self):
+        i = proforma.ProForma.parse(
+            "ELVIS[Phospho|INFO:newly discovered|info:really awesome]K")
+        tags = i[4][1][0].find_tag_type(proforma.TagTypeEnum.info)
+        messages = set(['newly discovered', 'really awesome'])
+        assert len(tags) == 2
+        for tag in tags:
+            messages.remove(tag.value)
+        assert len(messages) == 0
+
+
+if __name__ == '__main__':
+    unittest.main()

From 35b46588569e07868a8f5ec41d3efb09fb4ecc3c Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 23 May 2021 23:14:34 -0400
Subject: [PATCH 11/27] No f-strings

---
 pyteomics/proforma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 49df3ed0..89cbc187 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -602,7 +602,7 @@ def resolve(self):
             else:
                 cnt = 1
             if tok not in self.valid_monosaccharides:
-                raise ValueError(f"{tok!r} is not a valid monosaccharide name")
+                raise ValueError("{tok!r} is not a valid monosaccharide name".format(**locals()))
             composite[tok] += cnt
         mass = 0
         chemcomp = Composition()

From feda7d0ff4b38306e1586d832c7c51e9e3f588db Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 23 May 2021 23:17:47 -0400
Subject: [PATCH 12/27] Use explicit super

---
 pyteomics/proforma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 89cbc187..291e9c3f 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1053,7 +1053,7 @@ def _transform(self, value):
         return tag
 
     def process(self):
-        value = super().process()
+        value = super(TagParser, self).process()
         if not isinstance(value, list):
             value = [value]
         return value

From 8be8fc597ed469b461784ae14ef83d692a06b2d9 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 23 May 2021 23:23:51 -0400
Subject: [PATCH 13/27] Add unknown amino acid

---
 pyteomics/proforma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 291e9c3f..2cbf6f85 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1095,7 +1095,7 @@ class ParserStateEnum(Enum):
 POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag
 DONE = ParserStateEnum.done
 
-VALID_AA = set("QWERTYIPASDFGHKLCVNM")
+VALID_AA = set("QWERTYIPASDFGHKLCVNMX")
 
 def parse_proforma(sequence):
     '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a

From 5508775aa222c4163128e15462495dfb9d4b876a Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Thu, 27 May 2021 11:53:26 -0400
Subject: [PATCH 14/27] Fix terminal masses

---
 pyteomics/proforma.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 2cbf6f85..fc55fb11 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -46,6 +46,7 @@
 coupled with minimal information about mass and position data.
 '''
 
+from pyteomics.mass.mass import calculate_mass
 import re
 import warnings
 from collections import namedtuple, defaultdict, deque
@@ -1456,12 +1457,15 @@ def mass(self):
                     mass += mod.mass
                 except (AttributeError, KeyError):
                     continue
+        mass += calculate_mass(formula="H")
         if self.properties.get('c_term'):
             for mod in self.properties['c_term']:
                 try:
                     mass += mod.mass
                 except (AttributeError, KeyError):
                     continue
+
+        mass += calculate_mass(formula="OH")
         for iv in self.properties['intervals']:
             try:
                 mass += iv.tag.mass

From 7caff0a2f2ce600867c0bf9abb008486abe41778 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Thu, 27 May 2021 12:24:14 -0400
Subject: [PATCH 15/27] update test

---
 tests/test_proforma.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_proforma.py b/tests/test_proforma.py
index b488c3cb..ed24eca3 100644
--- a/tests/test_proforma.py
+++ b/tests/test_proforma.py
@@ -28,7 +28,8 @@ def test_complicated_short(self):
         assert properties['fixed_modifications'][0] == ModificationRule(
             GenericModification('Carbamidomethyl', None, None), ['C'])
         assert to_proforma(tokens, **properties) == complicated_short
-        self.assertAlmostEqual(ProForma(tokens, properties).mass, 1192.498297, 3)
+        self.assertAlmostEqual(
+            ProForma(tokens, properties).mass, 1210.5088, 3)
 
 
     def test_ranges(self):

From 2b9402b301adecf5065b5eff16f70544339cb6bb Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 12:14:46 -0400
Subject: [PATCH 16/27] Fully support all the required additional amino acids

---
 pyteomics/mass/mass.py | 3 +++
 pyteomics/proforma.py  | 9 ++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py
index d238b289..2348794e 100644
--- a/pyteomics/mass/mass.py
+++ b/pyteomics/mass/mass.py
@@ -394,6 +394,7 @@ def mass(self, **kwargs):
     'G':   Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}),
     'H':   Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}),
     'I':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
+    'J':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
     'K':   Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}),
     'L':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
     'M':   Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}),
@@ -726,6 +727,7 @@ def isotopologues(*args, **kwargs):
     'C': 103.00919,
     'L': 113.08406,
     'I': 113.08406,
+    'J': 113.08406,
     'N': 114.04293,
     'D': 115.02694,
     'Q': 128.05858,
@@ -739,6 +741,7 @@ def isotopologues(*args, **kwargs):
     'Y': 163.06333,
     'W': 186.07931,
     'O': 237.14773,
+    'U': 168.065,
 }
 """A dictionary with monoisotopic masses of the twenty standard
 amino acid residues, selenocysteine and pyrrolysine.
diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index fc55fb11..264273a3 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -69,7 +69,7 @@
     from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache)
 except ImportError:
     def _needs_psims(name):
-        raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims" % name)
+        raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims`" % name)
 
     load_psimod = partial(_needs_psims, 'PSIMOD')
     load_xlmod = partial(_needs_psims, 'XLMOD')
@@ -1096,7 +1096,7 @@ class ParserStateEnum(Enum):
 POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag
 DONE = ParserStateEnum.done
 
-VALID_AA = set("QWERTYIPASDFGHKLCVNMX")
+VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB")
 
 def parse_proforma(sequence):
     '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a
@@ -1437,7 +1437,10 @@ def mass(self):
 
         for position in self.sequence:
             aa = position[0]
-            mass += std_aa_mass[aa]
+            try:
+                mass += std_aa_mass[aa]
+            except KeyError:
+                warnings.warn("%r does not have an exact mass" % (aa, ))
             if aa in fixed_rules:
                 mass += fixed_rules[aa]
             tags = position[1]

From 5f5166e6e8d67244d1cfbdbe536ed06fad066414 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 15:21:19 -0400
Subject: [PATCH 17/27] Remove duplicated undehydrated selenocysteine mass

---
 pyteomics/mass/mass.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py
index 2348794e..cb0777c3 100644
--- a/pyteomics/mass/mass.py
+++ b/pyteomics/mass/mass.py
@@ -741,7 +741,6 @@ def isotopologues(*args, **kwargs):
     'Y': 163.06333,
     'W': 186.07931,
     'O': 237.14773,
-    'U': 168.065,
 }
 """A dictionary with monoisotopic masses of the twenty standard
 amino acid residues, selenocysteine and pyrrolysine.

From 5937299bdf9c4602d732e1768195f6b6f661ac39 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 16:05:22 -0400
Subject: [PATCH 18/27] Properly handle nested braces and isotopes

---
 pyteomics/proforma.py  | 32 +++++++++++++++++++++++++++++---
 tests/test_proforma.py |  5 +++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 264273a3..f49bfd9f 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -558,12 +558,32 @@ def resolve(self):
 class FormulaModification(ModificationBase):
     prefix_name = "Formula"
 
+    isotope_pattern = re.compile(r'\[(?P<isotope>\d+)(?P<element>[A-Z][a-z]*)(?P<quantity>[\-+]?\d+)\]')
     _tag_type = TagTypeEnum.formula
 
+    def _normalize_isotope_notation(self, match):
+        '''Rewrite ProForma isotope notation to Pyteomics-compatible
+        isotope notation.
+
+        Parameters
+        ----------
+        match : Match
+            The matched isotope notation string parsed by the regular expression.
+
+        Returns
+        reformatted : str
+            The re-written isotope notation
+        '''
+        parts = match.groupdict()
+        return "{element}[{isotope}]{quantity}".format(**parts)
+
     def resolve(self):
-        # The handling of fixed isotopes is wrong here as Pyteomics uses a different
-        # convention.
-        composition = Composition(formula=''.join(self.value.split(" ")))
+        normalized = ''.join(self.value.split(" "))
+        # If there is a [ character in the formula, we know there are isotopes which
+        # need to be normalized.
+        if '[' in normalized:
+            normalized = self.isotope_pattern.sub(self._normalize_isotope_notation, normalized)
+        composition = Composition(formula=normalized)
         return {
             "mass": composition.mass(),
             "composition": composition,
@@ -1199,6 +1219,7 @@ def parse_proforma(sequence):
         elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:
             if c == '[':
                 depth += 1
+                current_tag.append(c)
             elif c == ']':
                 depth -= 1
                 if depth <= 0:
@@ -1212,6 +1233,8 @@ def parse_proforma(sequence):
                         state = DONE
                     elif state == GLOBAL:
                         state = POST_GLOBAL
+                else:
+                    current_tag.append(c)
             else:
                 current_tag.append(c)
         elif state == FIXED:
@@ -1243,11 +1266,14 @@ def parse_proforma(sequence):
         elif state == INTERVAL_TAG:
             if c == '[':
                 depth += 1
+                current_tag.append(c)
             elif c == ']':
                 depth -= 1
                 if depth <= 0:
                     state = POST_INTERVAL_TAG
                     depth = 0
+                else:
+                    current_tag.append(c)
             else:
                 current_tag.append(c)
         elif state == POST_INTERVAL_TAG:
diff --git a/tests/test_proforma.py b/tests/test_proforma.py
index ed24eca3..36b9ea7f 100644
--- a/tests/test_proforma.py
+++ b/tests/test_proforma.py
@@ -61,6 +61,11 @@ def test_multiple_info(self):
             messages.remove(tag.value)
         assert len(messages) == 0
 
+    def test_formula(self):
+        i = proforma.ProForma.parse("SEQUEN[Formula:[13C2]CH6N]CE")
+        mod = i[-3][1][0]
+        assert mod.composition == proforma.Composition(
+            {'H': 6, 'C[13]': 2, 'C': 1, 'N': 1})
 
 if __name__ == '__main__':
     unittest.main()

From 53c330af9a37b792211bf27eb626abe31ce93ca3 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 16:07:02 -0400
Subject: [PATCH 19/27] Update pyteomics/proforma.py

Co-authored-by: Lev Levitsky <lev.levitsky@phystech.edu>
---
 pyteomics/proforma.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index f49bfd9f..2402aae6 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -44,6 +44,12 @@
 
 :py:class:`ProForma` - An object oriented version of the parsing and formatting code,
 coupled with minimal information about mass and position data.
+
+Dependencies
+------------
+
+To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required.
+
 '''
 
 from pyteomics.mass.mass import calculate_mass

From e77ca1a8e81dc4add36eda5dac0beb10c47e8c38 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 16:12:03 -0400
Subject: [PATCH 20/27] Update pyteomics/proforma.py

Co-authored-by: Lev Levitsky <lev.levitsky@phystech.edu>
---
 pyteomics/proforma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 2402aae6..59357839 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -402,7 +402,7 @@ def __init__(self, **kwargs):
         self._database = kwargs.get("database")
 
     def load_database(self):
-        return load_psimod()
+        return load_xlmod()
 
     def resolve(self, name=None, id=None, **kwargs):
         if name is not None:

From 25fde39864404aba2b1471df1faa9fb4e3031f38 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Mon, 31 May 2021 17:20:40 -0400
Subject: [PATCH 21/27] Add compliance level to documentation

---
 pyteomics/proforma.py | 44 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 59357839..63185692 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -50,6 +50,50 @@
 
 To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required.
 
+
+Compliance Levels
+-----------------
+
+1. Base Level Support
+Represents the lowest level of compliance, this level involves providing support for:
+
+    - [x] Amino acid sequences
+    - [x] Protein modifications using two of the supported CVs/ontologies: Unimod and PSI-MOD.
+    - [x] Protein modifications using delta masses (without prefixes)
+    - [x] N-terminal, C-terminal and labile modifications.
+    - [x] Ambiguity in the modification position, including support for localisation scores.
+    - [x] INFO tag.
+
+2. Additional Separate Support
+These features are independent from each other:
+
+    - [x] Unusual amino acids (O and U).
+    - [x] Ambiguous amino acids (e.g. X, B, Z). This would include support for sequence tags of known mass (using the character X).
+    - [x] Protein modifications using delta masses (using prefixes for the different CVs/ontologies).
+    - [x] Use of prefixes for Unimod (U:) and PSI-MOD (M:) names.
+    - [x] Support for the joint representation of experimental data and its interpretation.
+
+3. Top Down Extensions
+
+    - [ ] Additional CV/ontologies for protein modifications: RESID (the prefix R MUST be used for RESID CV/ontology term names)
+    - [x] Chemical formulas (this feature occurs in two places in this list).
+
+4. Cross-Linking Extensions
+
+    - [ ]  Cross-linked peptides (using the XL-MOD CV/ontology, the prefix X MUST be used for XL-MOD CV/ontology term names).
+
+5. Glycan Extensions
+
+    - [x] Additional CV/ontologies for protein modifications: GNO (the prefix G MUST be used for GNO CV/ontology term names)
+    - [x] Glycan composition.
+    - [x] Chemical formulas (this feature occurs in two places in this list).
+
+6. Spectral Support
+
+    - [ ] Charge and chimeric spectra are special cases.
+    - [x] Global modifications (e.g., every C is C13).
+
+
 '''
 
 from pyteomics.mass.mass import calculate_mass

From da164db1c9877c85e10b4ae8b44cdf30256c6035 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Wed, 2 Jun 2021 20:54:36 -0400
Subject: [PATCH 22/27] Fix up glycan mass calculation

---
 .github/workflows/pythonpackage.yml |  2 +-
 pyteomics/proforma.py               | 37 ++++++++++++++++++++---------
 tests/test_proforma.py              | 17 +++++++++----
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 5ca06e87..a79d2011 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -26,7 +26,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install numpy
-        pip install lxml sqlalchemy pandas cython h5py hdf5plugin
+        pip install lxml sqlalchemy pandas cython h5py hdf5plugin psims
         pip install pynumpress
     - name: Run the tests
       run: |
diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 63185692..5c733037 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -12,9 +12,9 @@
 Data Access
 -----------
 
-:py:func:`parse_proforma` - The primary interface for parsing ProForma strings.
+:py:func:`parse` - The primary interface for parsing ProForma strings.
 
-    >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK")
+    >>> parse("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK")
         ([('E', None),
           ('M', GenericModification('Oxidation', None, None)),
           ('E', None),
@@ -99,7 +99,7 @@
 from pyteomics.mass.mass import calculate_mass
 import re
 import warnings
-from collections import namedtuple, defaultdict, deque
+from collections import deque
 from functools import partial
 
 try:
@@ -108,13 +108,10 @@
     # Python 2 doesn't have a builtin Enum type
     Enum = object
 
-
-from pyteomics import parser
 from pyteomics.mass import Composition, std_aa_mass, Unimod
 from pyteomics.auxiliary import PyteomicsError, BasicComposition
 from pyteomics.auxiliary.utils import add_metaclass
 
-# To eventually be implemented with pyteomics port?
 try:
     from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache)
 except ImportError:
@@ -126,6 +123,7 @@ def _needs_psims(name):
     load_gno = partial(_needs_psims, 'GNO')
     obo_cache = None
 
+_water_mass = calculate_mass("H2O")
 
 std_aa_mass = std_aa_mass.copy()
 std_aa_mass['X'] = 0
@@ -481,13 +479,24 @@ def __init__(self, **kwargs):
     def load_database(self):
         return load_gno()
 
+    def get_mass_from_glycan_composition(self, term):
+        val = term.get('GNO:00000202')
+        if val:
+            tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val)
+            mass = 0.0
+            for symbol, count in tokens:
+                mass += GlycanModification.valid_monosaccharides[symbol][0] * int(count)
+            return mass
+        return None
+
     def get_mass_from_term(self, term):
+        raw_mass = self.get_mass_from_glycan_composition(term)
         root_id = 'GNO:00000001'
         parent = term.parent()
         if isinstance(parent, list):
             parent = parent[0]
         while parent.id != root_id:
-            next_parent = term.parent()
+            next_parent = parent.parent()
             if isinstance(next_parent, list):
                 next_parent = next_parent[0]
             if next_parent.id == root_id:
@@ -496,7 +505,11 @@ def get_mass_from_term(self, term):
         match = self.mass_pattern.search(parent.name)
         if not match:
             return None
-        return float(match.group(1))
+        # This will have a small mass error.
+        rough_mass = float(match.group(1)) - _water_mass
+        if abs(rough_mass - raw_mass) < 1:
+            return raw_mass
+        return rough_mass
 
     def resolve(self, name=None, id=None, **kwargs):
         if name is not None:
@@ -512,6 +525,7 @@ def resolve(self, name=None, id=None, **kwargs):
             "composition": None,
             "mass": self.get_mass_from_term(term)
         }
+        return rec
 
 
 class GenericResolver(ModificationResolver):
@@ -628,7 +642,7 @@ def _normalize_isotope_notation(self, match):
         return "{element}[{isotope}]{quantity}".format(**parts)
 
     def resolve(self):
-        normalized = ''.join(self.value.split(" "))
+        normalized = self.value.replace(' ', '')
         # If there is a [ character in the formula, we know there are isotopes which
         # need to be normalized.
         if '[' in normalized:
@@ -656,6 +670,7 @@ class GlycanModification(ModificationBase):
         "NeuAc": (291.0954, Composition("C11H17N1O8")),
         "NeuGc": (307.0903, Composition("C11H17N1O9")),
         "Pen": (132.0422, Composition("C5H8O4")),
+        "Pent": (132.0422, Composition("C5H8O4")),
         "Fuc": (146.0579, Composition("C6H10O4"))
     }
 
@@ -1168,7 +1183,7 @@ class ParserStateEnum(Enum):
 
 VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB")
 
-def parse_proforma(sequence):
+def parse(sequence):
     '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a
     mapping of sequence-spanning modifiers.
 
@@ -1499,7 +1514,7 @@ def __ne__(self, other):
 
     @classmethod
     def parse(cls, string):
-        return cls(*parse_proforma(string))
+        return cls(*parse(string))
 
     @property
     def mass(self):
diff --git a/tests/test_proforma.py b/tests/test_proforma.py
index 36b9ea7f..b19673ad 100644
--- a/tests/test_proforma.py
+++ b/tests/test_proforma.py
@@ -7,9 +7,9 @@
     path.join(path.dirname(__file__), path.pardir, 'pyteomics'))]
 from pyteomics import proforma
 from pyteomics.proforma import (
-    ProForma, TaggedInterval, parse_proforma, MassModification,
+    ProForma, TaggedInterval, parse, MassModification,
     ModificationRule, StableIsotope, GenericModification, to_proforma,
-    )
+    obo_cache)
 
 
 class ProFormaTest(unittest.TestCase):
@@ -17,7 +17,7 @@ class ProFormaTest(unittest.TestCase):
 
     def test_complicated_short(self):
         complicated_short = r"<[Carbamidomethyl]@C><13C>[Hydroxylation]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING"
-        tokens, properties = parse_proforma(complicated_short)
+        tokens, properties = parse(complicated_short)
         assert len(tokens) == 8
         assert len(properties['n_term']) == 1
         assert properties['n_term'][0] == 'Hex'
@@ -38,7 +38,7 @@ def test_ranges(self):
         assert str(parsed) == seq
 
     def test_error_on_nested_range(self):
-        self.assertRaises(proforma.ProFormaError, lambda: parse_proforma(
+        self.assertRaises(proforma.ProFormaError, lambda: parse(
             "PRQT(EQ(CFQR)[Carbamidomethyl]MS)[+19.0523]ISK"))
 
     def test_localization_scores(self):
@@ -67,5 +67,14 @@ def test_formula(self):
         assert mod.composition == proforma.Composition(
             {'H': 6, 'C[13]': 2, 'C': 1, 'N': 1})
 
+    def test_gnome(self):
+        gp = proforma.ProForma.parse("NEEYN[GNO:G59626AS]K")
+        self.assertAlmostEqual(gp.mass, 2709.016, 3)
+
+    def test_glycan(self):
+        gp = proforma.ProForma.parse("NEEYN[Glycan:Hex5HexNAc4NeuAc1]K")
+        self.assertAlmostEqual(gp.mass, 2709.016, 3)
+
+
 if __name__ == '__main__':
     unittest.main()

From 803120759da9a2947f8f5cb1bcf23bc186c7c879 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Wed, 2 Jun 2021 21:25:31 -0400
Subject: [PATCH 23/27] Fix slice behavior

---
 pyteomics/proforma.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 5c733037..8e9566a2 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1496,8 +1496,7 @@ def __repr__(self):
     def __getitem__(self, i):
         if isinstance(i, slice):
             props = self.properties.copy()
-
-            return self.__class__(self.sequence[i], self.properties)
+            return self.__class__(self.sequence[i], props)
         else:
             return self.sequence[i]
 

From 293b050d4899f651beb7d7cf1f3f25b4e272feba Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 6 Jun 2021 20:48:07 -0400
Subject: [PATCH 24/27] Simplify, more documentation

---
 pyteomics/proforma.py | 383 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 331 insertions(+), 52 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 8e9566a2..d88c3510 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -16,14 +16,14 @@
 
     >>> parse("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK")
         ([('E', None),
-          ('M', GenericModification('Oxidation', None, None)),
+          ('M', [GenericModification('Oxidation', None, None)]),
           ('E', None),
           ('V', None),
-          ('T', LocalizationMarker(0.01, None, '#g1')),
-          ('S', LocalizationMarker(0.09, None, '#g1')),
+          ('T', [LocalizationMarker(0.01, None, '#g1')]),
+          ('S', [LocalizationMarker(0.09, None, '#g1')]),
           ('E', None),
           ('S',
-          GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),
+          [GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')]),
           ('P', None),
           ('E', None),
           ('K', None)],
@@ -48,7 +48,10 @@
 Dependencies
 ------------
 
-To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required.
+To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required. By default,
+:mod:`psims` retrieves the most recent version of each ontology from the internet, but
+includes a fall-back version to use when the network is unavailable. It can also create
+an application cache on disk.
 
 
 Compliance Levels
@@ -90,7 +93,8 @@
 
 6. Spectral Support
 
-    - [ ] Charge and chimeric spectra are special cases.
+    - [x] Charge state and adducts
+    - [ ] Chimeric spectra are special cases.
     - [x] Global modifications (e.g., every C is C13).
 
 
@@ -108,7 +112,7 @@
     # Python 2 doesn't have a builtin Enum type
     Enum = object
 
-from pyteomics.mass import Composition, std_aa_mass, Unimod
+from pyteomics.mass import Composition, std_aa_mass, Unimod, nist_mass
 from pyteomics.auxiliary import PyteomicsError, BasicComposition
 from pyteomics.auxiliary.utils import add_metaclass
 
@@ -128,6 +132,10 @@ def _needs_psims(name):
 std_aa_mass = std_aa_mass.copy()
 std_aa_mass['X'] = 0
 
+element_symbols = set(nist_mass)
+element_symbols.remove("e*")
+element_symbols.add('e')
+
 
 class ProFormaError(PyteomicsError):
     def __init__(self, message, index=None, parser_state=None, **kwargs):
@@ -480,17 +488,64 @@ def load_database(self):
         return load_gno()
 
     def get_mass_from_glycan_composition(self, term):
+        '''Parse the Byonic-style glycan composition from property GNO:00000202
+        to get the counts of each monosaccharide and use that to calculate mass.
+
+        The mass computed here is exact and dehydrated, distinct from the rounded-off
+        mass that :meth:`get_mass_from_term` will produce by walking up the CV term
+        hierarchy. However, not all glycan compositions are representable in GNO:00000202
+        format, so this may silently be absent or incomplete, hence the double-check in
+        :meth:`get_mass_from_term`.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+
+        Returns
+        -------
+        mass, monosaccharides, composition : tuple
+            If a glycan composition is found on the term, the computed mass, the monosaccharide
+            counts and the composition are returned. Otherwise ``(None, None, None)`` is returned.
+        '''
         val = term.get('GNO:00000202')
+        monosaccharides = BasicComposition()
+        composition = Composition()
         if val:
             tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val)
             mass = 0.0
             for symbol, count in tokens:
-                mass += GlycanModification.valid_monosaccharides[symbol][0] * int(count)
-            return mass
-        return None
+                count = int(count)
+                try:
+                    mono_mass, mono_comp = GlycanModification.valid_monosaccharides[symbol]
+                    mass += mono_mass * count
+                    composition += mono_comp * count
+                    monosaccharides[symbol] += count
+                except KeyError:
+                    continue
+            return mass, monosaccharides, composition
+        return None, None, None
+
+    def get_mass_from_term(self, term, raw_mass):
+        '''Walk up the term hierarchy and find the mass group
+        term near the root of the tree, and return the most accurate
+        mass available for the provided term.
 
-    def get_mass_from_term(self, term):
-        raw_mass = self.get_mass_from_glycan_composition(term)
+        The mass group term's mass is rounded to two decimal places, leading
+        to relatively large errors.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+
+        Returns
+        -------
+        mass : float or :const:`None`
+            If a root node is found along the term's lineage, the computed
+            mass will be returned. Otherwise :const:`None` is returned.
+            The mass may be only a rough approximation if `raw_mass` is missing or disagrees with it.
+        '''
         root_id = 'GNO:00000001'
         parent = term.parent()
         if isinstance(parent, list):
@@ -507,8 +562,11 @@ def get_mass_from_term(self, term):
             return None
         # This will have a small mass error.
         rough_mass = float(match.group(1)) - _water_mass
-        if abs(rough_mass - raw_mass) < 1:
+        if raw_mass is not None and abs(rough_mass - raw_mass) < 1:
             return raw_mass
+        warnings.warn(
+            ("An accurate glycan composition could not be inferred from %s. "
+             "Only a rough approximation is available.") % (term, ))
         return rough_mass
 
     def resolve(self, name=None, id=None, **kwargs):
@@ -518,12 +576,15 @@ def resolve(self, name=None, id=None, **kwargs):
             term = self.database[id]
         else:
             raise ValueError("Must provide one of `name` or `id`")
+        raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term)
+
         rec = {
             "name":term.name,
             "id": term.id,
             "provider": self.name,
-            "composition": None,
-            "mass": self.get_mass_from_term(term)
+            "composition": composition,
+            "monosaccharides": monosaccharides,
+            "mass": self.get_mass_from_term(term, raw_mass)
         }
         return rec
 
@@ -730,9 +791,13 @@ class GNOmeModification(ModificationBase):
     resolver = GNOResolver()
 
     prefix_name = "GNO"
-    # short_prefix = 'G'
+    short_prefix = 'G'
     _tag_type = TagTypeEnum.gnome
 
+    @property
+    def monosaccharides(self):
+        return self.definition.get('monosaccharides')
+
 
 class XLMODModification(ModificationBase):
     __slots__ = ()
@@ -993,33 +1058,113 @@ class TaggedInterval(object):
         The starting position (inclusive) of the interval along the primary sequence
     end: int
         The ending position (exclusive) of the interval along the primary sequence
-    tag: TagBase
-        The tag being localized
+    tags: list[TagBase]
+        The tags being localized
     '''
-    __slots__ = ('start', 'end', 'tag')
+    __slots__ = ('start', 'end', 'tags')
 
-    def __init__(self, start, end=None, tag=None):
+    def __init__(self, start, end=None, tags=None):
         self.start = start
         self.end = end
-        self.tag = tag
+        self.tags = tags
 
     def __eq__(self, other):
         if other is None:
             return False
-        return self.start == other.start and self.end == other.end and self.tag == other.tag
+        return self.start == other.start and self.end == other.end and self.tags == other.tags
 
     def __ne__(self, other):
         return not self == other
 
     def __str__(self):
-        return "({self.start}-{self.end}){self.tag!r}".format(self=self)
+        return "({self.start}-{self.end}){self.tags!r}".format(self=self)
 
     def __repr__(self):
-        return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self)
+        return "{self.__class__.__name__}({self.start}, {self.end}, {self.tags})".format(self=self)
 
     def as_slice(self):
         return slice(self.start, self.end)
 
+    def copy(self):
+        return self.__class__(self.start, self.end, self.tags)
+
+    def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True):
+        if end is None:
+            qend = self.end + 1
+        else:
+            qend = end
+        if start is None:
+            qstart = self.start - 1
+        else:
+            qstart = start
+
+        # Fully contained interval
+        valid = qstart <= self.start and qend >= self.end
+
+        if not valid:
+            # Spans the beginning but not the end
+            valid = qstart <= self.start and qend > self.start
+            if valid and warn_ambiguous:
+                warnings.warn("Slice bisecting interval %s" % (self, ))
+
+        if not valid:
+            # Spans the end but not the beginning
+            valid = qstart < self.end and qend > self.end
+            if valid and warn_ambiguous:
+                warnings.warn("Slice bisecting interval %s" % (self, ))
+
+        if not valid:
+            # Contained interval
+            valid = qstart >= self.start and qend < self.end
+            if valid and warn_ambiguous:
+                warnings.warn("Slice bisecting interval %s" % (self, ))
+
+        if not valid:
+            return None
+        new = self.copy()
+        if start is not None:
+            diff = self.start - start
+            if diff < 0:
+                diff = 0
+            new.start = diff
+        if end is not None:
+            width = min(new.end, end) - self.start
+        else:
+            width = self.end - max(start, self.start)
+        new.end = new.start + width
+        return new
+
+
+class ChargeState(object):
+    '''Describes the charge and adduct types of the structure.
+
+    Attributes
+    ----------
+    charge : int
+        The total charge state as a signed number.
+    adducts : list[str]
+        Each charge carrier associated with the molecule.
+    '''
+    __slots__ = ("charge", "adducts")
+
+    def __init__(self, charge, adducts=None):
+        if adducts is None:
+            adducts = []
+        self.charge = charge
+        self.adducts = adducts
+
+    def __str__(self):
+        tokens = [str(self.charge)]
+        if self.adducts:
+            tokens.append("[")
+            tokens.append(','.join(str(adduct) for adduct in self.adducts))
+            tokens.append("]")
+        return ''.join(tokens)
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.charge}, {self.adducts})"
+        return template.format(self=self)
+
 
 class TokenBuffer(object):
     '''A token buffer that wraps the accumulation and reset logic
@@ -1098,17 +1243,19 @@ def __call__(self):
 class NumberParser(TokenBuffer):
     '''A buffer which accumulates tokens until it is asked to parse them into
     :class:`int` instances.
+    '''
 
-    Implements a subset of the Sequence protocol.
+    def _transform(self, value):
+        return int(''.join(value))
 
-    Attributes
-    ----------
-    buffer: list
-        The list of tokens accumulated since the last parsing.
+
+class StringParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`str` instances.
     '''
 
     def _transform(self, value):
-        return int(''.join(value))
+        return ''.join(value)
 
 
 class TagParser(TokenBuffer):
@@ -1161,6 +1308,14 @@ class ParserStateEnum(Enum):
     post_global = 12
     post_global_aa = 13
     post_interval_tag = 14
+    post_tag_after = 15
+    charge_state_start = 16
+    charge_state_number = 17
+    charge_state_adduct_start = 18
+    charge_state_adduct_end = 19
+    inter_chain_cross_link_start = 20
+    chimeric_start = 21
+
     done = 999
 
 
@@ -1175,10 +1330,15 @@ class ParserStateEnum(Enum):
 INTERVAL_TAG = ParserStateEnum.interval_tag
 TAG_AFTER = ParserStateEnum.tag_after_sequence
 POST_TAG_BEFORE = ParserStateEnum.post_tag_before
+POST_TAG_AFTER = ParserStateEnum.post_tag_after
 UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count
 POST_GLOBAL = ParserStateEnum.post_global
 POST_GLOBAL_AA = ParserStateEnum.post_global_aa
 POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag
+CHARGE_START = ParserStateEnum.charge_state_start
+CHARGE_NUMBER = ParserStateEnum.charge_state_number
+ADDUCT_START = ParserStateEnum.charge_state_adduct_start
+ADDUCT_END = ParserStateEnum.charge_state_adduct_end
 DONE = ParserStateEnum.done
 
 VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB")
@@ -1198,7 +1358,7 @@ def parse(sequence):
 
     Returns
     -------
-    parsed_sequence: list[tuple[str, TagBase]]
+    parsed_sequence: list[tuple[str, list[TagBase]]]
         The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence
     modifiers: dict
         A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized
@@ -1226,9 +1386,15 @@ def parse(sequence):
     current_unlocalized_count = NumberParser()
     current_aa_targets = TokenBuffer()
 
+    charge_buffer = None
+    adduct_buffer = None
+
+    # A mostly context free finite state machine unrolled
+    # by hand.
     while i < n:
         c = sequence[i]
         i += 1
+        # Initial state prior to sequence content
         if state == BEFORE:
             if c == '[':
                 state = TAG_BEFORE
@@ -1244,6 +1410,7 @@ def parse(sequence):
             else:
                 raise ProFormaError(
                     "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # The body of the amino acid sequence.
         elif state == SEQ:
             if c in VALID_AA:
                 positions.append((current_aa, current_tag() if current_tag else None))
@@ -1279,9 +1446,16 @@ def parse(sequence):
                     raise ProFormaError("Missing Closing Tag", i, state)
                 i += 1
                 depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
             else:
                 raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:
+        # Tag parsing which relies on `current_tag` to buffer tokens.
+        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG:
             if c == '[':
                 depth += 1
                 current_tag.append(c)
@@ -1295,20 +1469,26 @@ def parse(sequence):
                         state = POST_TAG_BEFORE
                     elif state == TAG_AFTER:
                         c_term = current_tag()
-                        state = DONE
+                        state = POST_TAG_AFTER
                     elif state == GLOBAL:
                         state = POST_GLOBAL
+                    elif state == INTERVAL_TAG:
+                        state = POST_INTERVAL_TAG
+                        depth = 0
                 else:
                     current_tag.append(c)
             else:
                 current_tag.append(c)
+        # Handle transition to fixed modifications or isotope labeling from opening signal.
         elif state == FIXED:
             if c == '[':
                 state = GLOBAL
             else:
                 # Do validation here
                 state = ISOTOPE
+                current_tag.reset()
                 current_tag.append(c)
+        # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens
         elif state == ISOTOPE:
             if c != '>':
                 current_tag.append(c)
@@ -1317,6 +1497,7 @@ def parse(sequence):
                 isotopes.append(StableIsotope(''.join(current_tag)))
                 current_tag.reset()
                 state = BEFORE
+        # Handle labile modifications, which rely on `current_tag` to buffer tokens
         elif state == LABILE:
             if c == '{':
                 depth += 1
@@ -1328,26 +1509,18 @@ def parse(sequence):
                     state = BEFORE
             else:
                 current_tag.append(c)
-        elif state == INTERVAL_TAG:
-            if c == '[':
-                depth += 1
-                current_tag.append(c)
-            elif c == ']':
-                depth -= 1
-                if depth <= 0:
-                    state = POST_INTERVAL_TAG
-                    depth = 0
-                else:
-                    current_tag.append(c)
-            else:
-                current_tag.append(c)
+        # The intermediate state between an interval tag and returning to sequence parsing.
+        # A new tag may start immediately, leading to it being appended to the interval instead
+        # of returning to the primary sequence. Because this state may also occur at the
+        # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags,
+        # charge states, and the like.
         elif state == POST_INTERVAL_TAG:
             if c == '[':
                 current_tag.bound()
                 state = INTERVAL_TAG
             elif c in VALID_AA:
                 current_aa = c
-                current_interval.tag = current_tag()
+                current_interval.tags = current_tag()
                 intervals.append(current_interval)
                 current_interval = None
                 state = SEQ
@@ -1357,6 +1530,17 @@ def parse(sequence):
                     raise ProFormaError("Missing Closing Tag", i, state)
                 i += 1
                 depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # An intermediate state for discriminating which kind of tag-before-sequence
+        # we just finished parsing.
         elif state == POST_TAG_BEFORE:
             if c == '?':
                 unlocalized_modifications.append(current_tag()[0])
@@ -1405,8 +1589,57 @@ def parse(sequence):
             else:
                 raise ProFormaError(
                     ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
+        elif state == POST_TAG_AFTER:
+            if c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+        elif state == CHARGE_START:
+            if c in '+-':
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c.isdigit():
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c == '/':
+                state = ParserStateEnum.inter_chain_cross_link_start
+                raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == CHARGE_NUMBER:
+            if c.isdigit():
+                charge_buffer.append(c)
+            elif c == "[":
+                state = ADDUCT_START
+                adduct_buffer = StringParser()
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == ADDUCT_START:
+            if c.isdigit() or c in "+-" or c in element_symbols:
+                adduct_buffer.append(c)
+            elif c == ',':
+                adduct_buffer.bound()
+            elif c == ']':
+                state = ADDUCT_END
+        elif state == ADDUCT_END:
+            if c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
         else:
             raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+    if charge_buffer:
+        charge_number = charge_buffer()
+        if adduct_buffer:
+            adducts = adduct_buffer()
+        else:
+            adducts = None
+        charge_state = ChargeState(charge_number, adducts)
+    else:
+        charge_state = None
     if current_aa:
         positions.append((current_aa, current_tag() if current_tag else None))
     if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
@@ -1419,13 +1652,14 @@ def parse(sequence):
         'fixed_modifications': fixed_modifications,
         'intervals': intervals,
         'isotopes': isotopes,
-        'group_ids': sorted(current_tag.group_ids)
+        'group_ids': sorted(current_tag.group_ids),
+        'charge_state': charge_state
     }
 
 
 def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None,
                 labile_modifications=None, fixed_modifications=None, intervals=None,
-                isotopes=None, group_ids=None):
+                isotopes=None, charge_state=None, group_ids=None):
     '''Convert a sequence plus modifiers into formatted text following the
     ProForma specification.
 
@@ -1447,6 +1681,8 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No
         A list of modified intervals, if any
     isotopes : Optional[list[StableIsotope]]
         Any global stable isotope labels applied
+    charge_state : Optional[ChargeState]
+        An optional charge state value
     group_ids : Optional[list[str]]
         Any group identifiers. This parameter is currently not used.
 
@@ -1465,11 +1701,13 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No
             primary[iv.start] = '(' + primary[iv.start]
 
             primary[iv.end - 1] = '{0!s})'.format(
-                primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tag)
+                primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tags)
     if n_term:
         primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
     if c_term:
         primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term))
+    if charge_state:
+        primary.append("/{!s}".format(charge_state))
     if labile_modifications:
         primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications])
     if unlocalized_modifications:
@@ -1482,11 +1720,46 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No
     return ''.join(primary)
 
 
+class _ProFormaProperty(object):
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, cls):
+        return obj.properties[self.name]
+
+    def __set__(self, obj, value):
+        obj.properties[self.name] = value
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.name!r})"
+        return template.format(self=self)
+
+
 class ProForma(object):
+    '''Represent a parsed ProForma sequence.
+
+    Attributes
+    ----------
+    sequence : list[tuple[]]
+    '''
+
     def __init__(self, sequence, properties):
         self.sequence = sequence
         self.properties = properties
 
+    isotopes = _ProFormaProperty('isotopes')
+    charge_state = _ProFormaProperty('charge_state')
+
+    intervals = _ProFormaProperty('intervals')
+    fixed_modifications = _ProFormaProperty('fixed_modifications')
+    labile_modifications = _ProFormaProperty('labile_modifications')
+    unlocalized_modifications = _ProFormaProperty('unlocalized_modifications')
+
+    n_term = _ProFormaProperty('n_term')
+    c_term = _ProFormaProperty('c_term')
+
+    group_ids = _ProFormaProperty('group_ids')
+
     def __str__(self):
         return to_proforma(self.sequence, **self.properties)
 
@@ -1496,6 +1769,14 @@ def __repr__(self):
     def __getitem__(self, i):
         if isinstance(i, slice):
             props = self.properties.copy()
+            ivs = []
+            for iv in props['intervals']:
+                iv = iv._update_coordinates_sliced(
+                    i.start, i.stop)
+                if iv is None:
+                    continue
+                ivs.append(iv)
+            props['intervals'] = ivs
             return self.__class__(self.sequence[i], props)
         else:
             return self.sequence[i]
@@ -1569,8 +1850,6 @@ def mass(self):
     def find_tags_by_id(self, tag_id, include_position=True):
         if not tag_id.startswith("#"):
             tag_id = "#" + tag_id
-        if tag_id not in self.properties['group_ids']:
-            return []
         matches = []
         for i, (_token, tags) in enumerate(self.sequence):
             if tags:

From 325088d68053ac5bdfaed7bb08e0009e26464d57 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 13 Jun 2021 15:56:53 -0400
Subject: [PATCH 25/27] Add ambiguous sequence regions

---
 pyteomics/proforma.py  | 100 ++++++++++++++++++++++++++++-------------
 tests/test_proforma.py |  11 ++++-
 2 files changed, 79 insertions(+), 32 deletions(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index d88c3510..67d08fa2 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1048,9 +1048,17 @@ def __repr__(self):
         return "{self.__class__.__name__}({self.isotope})".format(self=self)
 
 
+class IntersectionEnum(Enum):
+    no_overlap = 0
+    full_contains_interval = 1
+    full_contained_in_interval = 2
+    start_overlap = 3
+    end_overlap = 4
+
+
 class TaggedInterval(object):
     '''Define a fixed interval over the associated sequence which contains the localization
-    of the associated tag.
+    of the associated tag or denotes a region of general sequence order ambiguity.
 
     Attributes
     ----------
@@ -1060,13 +1068,16 @@ class TaggedInterval(object):
         The ending position (exclusive) of the interval along the primary sequence
     tags: list[TagBase]
         The tags being localized
+    ambiguous : bool
+        Whether the interval is ambiguous or not
     '''
-    __slots__ = ('start', 'end', 'tags')
+    __slots__ = ('start', 'end', 'tags', 'ambiguous')
 
-    def __init__(self, start, end=None, tags=None):
+    def __init__(self, start, end=None, tags=None, ambiguous=False):
         self.start = start
         self.end = end
         self.tags = tags
+        self.ambiguous = ambiguous
 
     def __eq__(self, other):
         if other is None:
@@ -1088,37 +1099,48 @@ def as_slice(self):
     def copy(self):
         return self.__class__(self.start, self.end, self.tags)
 
-    def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True):
-        if end is None:
-            qend = self.end + 1
-        else:
-            qend = end
-        if start is None:
-            qstart = self.start - 1
-        else:
-            qstart = start
-
+    def _check_slice(self, qstart, qend, warn_ambiguous):
         # Fully contained interval
         valid = qstart <= self.start and qend >= self.end
-
+        case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap
         if not valid:
             # Spans the beginning but not the end
             valid = qstart <= self.start and qend > self.start
-            if valid and warn_ambiguous:
-                warnings.warn("Slice bisecting interval %s" % (self, ))
+            if valid:
+                case = IntersectionEnum.start_overlap
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
 
         if not valid:
             # Spans the end but not the beginning
             valid = qstart < self.end and qend > self.end
-            if valid and warn_ambiguous:
-                warnings.warn("Slice bisecting interval %s" % (self, ))
+            if valid:
+                case = IntersectionEnum.end_overlap
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
 
         if not valid:
             # Contained interval
             valid = qstart >= self.start and qend < self.end
-            if valid and warn_ambiguous:
-                warnings.warn("Slice bisecting interval %s" % (self, ))
+            if valid:
+                case = IntersectionEnum.full_contains_interval
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
+        return valid, case
+
+    def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True):
+        if end is None:
+            qend = self.end + 1
+        else:
+            qend = end
+        if start is None:
+            qstart = self.start - 1
+        else:
+            qstart = start
 
+        valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous)
+        if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap):
+            raise ValueError("Cannot bisect an ambiguous interval")
         if not valid:
             return None
         new = self.copy()
@@ -1315,7 +1337,7 @@ class ParserStateEnum(Enum):
     charge_state_adduct_end = 19
     inter_chain_cross_link_start = 20
     chimeric_start = 21
-
+    interval_initial = 22
     done = 999
 
 
@@ -1328,6 +1350,7 @@ class ParserStateEnum(Enum):
 SEQ = ParserStateEnum.sequence
 TAG = ParserStateEnum.tag_in_sequence
 INTERVAL_TAG = ParserStateEnum.interval_tag
+INTERVAL_INIT = ParserStateEnum.interval_initial
 TAG_AFTER = ParserStateEnum.tag_after_sequence
 POST_TAG_BEFORE = ParserStateEnum.post_tag_before
 POST_TAG_AFTER = ParserStateEnum.post_tag_after
@@ -1411,9 +1434,16 @@ def parse(sequence):
                 raise ProFormaError(
                     "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
         # The body of the amino acid sequence.
-        elif state == SEQ:
+        elif state == SEQ or state == INTERVAL_INIT:
+            if state == INTERVAL_INIT:
+                state = SEQ
+                if c == '?':
+                    if current_interval is not None:
+                        current_interval.ambiguous = True
+                    continue
             if c in VALID_AA:
-                positions.append((current_aa, current_tag() if current_tag else None))
+                if current_aa is not None:
+                    positions.append((current_aa, current_tag() if current_tag else None))
                 current_aa = c
             elif c == '[':
                 state = TAG
@@ -1427,6 +1457,7 @@ def parse(sequence):
                          "Nested ranges are not yet supported by ProForma.").format(
                             **locals()), i, state)
                 current_interval = TaggedInterval(len(positions) + 1)
+                state = INTERVAL_INIT
             elif c == ')':
                 positions.append(
                     (current_aa, current_tag() if current_tag else None))
@@ -1435,11 +1466,13 @@ def parse(sequence):
                     raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
                 else:
                     current_interval.end = len(positions)
-                    if i >= n or sequence[i] != '[':
-                        raise ProFormaError("Missing Interval Tag", i, state)
-                    i += 1
-                    depth = 1
-                    state = INTERVAL_TAG
+                    if i < n and sequence[i] == '[':
+                        i += 1
+                        depth = 1
+                        state = INTERVAL_TAG
+                    else:
+                        intervals.append(current_interval)
+                        current_interval = None
             elif c == '-':
                 state = TAG_AFTER
                 if i >= n or sequence[i] != '[':
@@ -1698,10 +1731,15 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No
             primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags]))
     if intervals:
         for iv in sorted(intervals, key=lambda x: x.start):
-            primary[iv.start] = '(' + primary[iv.start]
+            if iv.ambiguous:
+                primary[iv.start] = '(?' + primary[iv.start]
+            else:
+                primary[iv.start] = '(' + primary[iv.start]
 
-            primary[iv.end - 1] = '{0!s})'.format(
-                primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tags)
+            terminator = '{0!s})'.format(primary[iv.end - 1])
+            if iv.tags:
+                terminator += ''.join('[{!s}]'.format(t) for t in iv.tags)
+            primary[iv.end - 1] = terminator
     if n_term:
         primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
     if c_term:
diff --git a/tests/test_proforma.py b/tests/test_proforma.py
index b19673ad..e47684c1 100644
--- a/tests/test_proforma.py
+++ b/tests/test_proforma.py
@@ -32,10 +32,19 @@ def test_complicated_short(self):
             ProForma(tokens, properties).mass, 1210.5088, 3)
 
 
-    def test_ranges(self):
+    def test_range(self):
         seq = "PRQT(EQC[Carbamidomethyl]FQRMS)[+19.0523]ISK"
         parsed = proforma.ProForma.parse(seq)
         assert str(parsed) == seq
+        chunk = parsed[:6]  # slice boundary falls inside the tagged (EQC...FQRMS) interval
+        assert chunk.intervals  # the slice of a non-ambiguous interval must still carry an interval
+
+    def test_ambiguous_range(self):
+        seq = "PRQT(?EQC[Carbamidomethyl]FQRMS)ISK"  # "(?" opens an ambiguous interval; no tag follows ")"
+        parsed = proforma.ProForma.parse(seq)
+        assert str(parsed) == seq  # round trip must re-emit the "(?" marker and no empty tag
+        self.assertRaises(ValueError, lambda: parsed[:6])  # slicing into an ambiguous interval must raise
+
 
     def test_error_on_nested_range(self):
         self.assertRaises(proforma.ProFormaError, lambda: parse(

From c54476565af97e7fc92852bda4dfa58a41ac0a5a Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 13 Jun 2021 16:00:31 -0400
Subject: [PATCH 26/27] ProForma testing requires psims

---
 test-requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test-requirements.txt b/test-requirements.txt
index 42b523ab..1af7e4e9 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -8,3 +8,4 @@ h5py
 hdf5plugin < 3.0.0; python_version < '3'
 hdf5plugin; python_version > '3.1'
 pynumpress
+psims
\ No newline at end of file

From 43fcebe5028c6aed1ac2eb0070b96f3907515601 Mon Sep 17 00:00:00 2001
From: Joshua Klein <mobiusklein@gmail.com>
Date: Sun, 13 Jun 2021 16:01:28 -0400
Subject: [PATCH 27/27] ci

---
 pyteomics/proforma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
index 67d08fa2..e6c432b0 100644
--- a/pyteomics/proforma.py
+++ b/pyteomics/proforma.py
@@ -1686,7 +1686,7 @@ def parse(sequence):
         'intervals': intervals,
         'isotopes': isotopes,
         'group_ids': sorted(current_tag.group_ids),
-        'charge_state': charge_state
+        'charge_state': charge_state,
     }