From c98feca822cf0028f5d43f7586402fed6f0a8436 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 12 Dec 2020 15:45:51 -0500 Subject: [PATCH 01/27] experimenting --- proforma_parsing.ipynb | 460 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 proforma_parsing.ipynb diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb new file mode 100644 index 00000000..f87866a1 --- /dev/null +++ b/proforma_parsing.ipynb @@ -0,0 +1,460 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from collections import namedtuple, defaultdict\n", + "from enum import Enum\n", + "\n", + "from six import add_metaclass" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "class PrefixSavingMeta(type):\n", + " def __new__(mcs, name, parents, attrs):\n", + " new_type = type.__new__(mcs, name, parents, attrs)\n", + " prefix = attrs.get(\"prefix_name\")\n", + " if prefix:\n", + " new_type.prefix_map[prefix] = new_type\n", + " short = attrs.get(\"short_prefix\")\n", + " if short:\n", + " new_type.prefix_map[short] = new_type\n", + " return new_type" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "class TagTypeEnum(Enum):\n", + " unimod = 0\n", + " psimod = 1\n", + " massmod = 2\n", + " generic = 3\n", + " info = 4\n", + " gnome = 5\n", + " formula = 6\n", + " glycan = 7\n", + " xlmod = 8\n", + " localization_marker = 9\n", + " group_placeholder = 999\n", + " \n", + "\n", + "@add_metaclass(PrefixSavingMeta)\n", + "class TagBase(object):\n", + " __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n", + "\n", + " prefix_name = None\n", + " short_prefix = None\n", + " prefix_map = {}\n", + " \n", + " def __init__(self, type, value, extra=None, group_id=None):\n", + " self.type = type\n", + " self.value = value\n", + " self.extra 
= extra or []\n", + " self.group_id = group_id\n", + "\n", + " def __str__(self):\n", + " part = self._format_main()\n", + " if self.extra:\n", + " rest = [str(e) for e in self.extra]\n", + " label = '|'.join([part] + rest)\n", + " else:\n", + " label = part\n", + " if self.group_id:\n", + " label = '%s#%s' % (label, self.group_id)\n", + " return label\n", + " \n", + " def __repr__(self):\n", + " template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n", + " return template.format(self=self)\n", + "\n", + "\n", + "class LocalizationMarker(TagBase):\n", + " __slots__ = ()\n", + " \n", + " def __init__(self, value, extra=None, group_id=None):\n", + " assert group_id is not None\n", + " super(LocalizationMarker, self).__init__(TagTypeEnum.localization_marker, float(value), extra, group_id)\n", + " \n", + " def _format_main(self):\n", + " return \"#{self.group_id}({self.value!f})\".format(self=self)\n", + " \n", + " \n", + "class MassModification(TagBase):\n", + " __slots__ = ()\n", + " \n", + " def __init__(self, value, extra=None, group_id=None):\n", + " super(MassModification, self).__init__(TagTypeEnum.massmod, float(value), extra, group_id)\n", + " \n", + " def _format_main(self):\n", + " return '%0.4f' % self.value\n", + "\n", + " \n", + "class ControlledVocabularyModificationBase(TagBase):\n", + " _tag_type = None\n", + " __slots__ = ()\n", + " \n", + " def __init__(self, value, extra=None, group_id=None):\n", + " super(ControlledVocabularyModificationBase, self).__init__(\n", + " self._tag_type, value, extra, group_id)\n", + "\n", + " def _format_main(self):\n", + " return \"{self.prefix_name}:{self.value}\".format(self=self)\n", + "\n", + " \n", + "class GenericModification(TagBase):\n", + " __slots__ = ()\n", + " \n", + " def __init__(self, value, extra=None, group_id=None):\n", + " super(GenericModification, self).__init__(TagTypeEnum.generic, value, extra, group_id)\n", + " \n", + " def _format_main(self):\n", + " return 
self.value\n", + " \n", + "\n", + "class UnimodModification(ControlledVocabularyModificationBase):\n", + " __slots__ = ()\n", + " \n", + " prefix_name = \"UNIMOD\"\n", + " short_prefix = \"U\"\n", + " _tag_type = TagTypeEnum.unimod\n", + "\n", + "\n", + "class PSIModModification(ControlledVocabularyModificationBase):\n", + " __slots__ = ()\n", + " \n", + " prefix_name = \"MOD\"\n", + " short_prefix = 'M'\n", + " _tag_type = TagTypeEnum.psimod\n", + "\n", + "\n", + "class GNOmeModification(ControlledVocabularyModificationBase):\n", + " __slots__ = ()\n", + " \n", + " prefix_name = \"GNO\"\n", + " short_prefix = 'G'\n", + " _tag_type = TagTypeEnum.gnome\n", + "\n", + " \n", + "class XLMODModification(ControlledVocabularyModificationBase):\n", + " __slots__ = ()\n", + " \n", + " prefix_name = \"XLMOD\"\n", + "# short_prefix = 'XL'\n", + " _tag_type = TagTypeEnum.xlmod\n", + " \n", + " \n", + "class TagParserStateEnum(Enum):\n", + " start = 0\n", + " group_id = 1\n", + "\n", + "def split_tags(tokens):\n", + " starts = [0]\n", + " ends = []\n", + " for i, c in enumerate(tokens):\n", + " if c == '|':\n", + " ends.append(i)\n", + " starts.append(i + 1)\n", + " ends.append(len(tokens))\n", + " out = []\n", + " for i, start in enumerate(starts):\n", + " end = ends[i]\n", + " out.append(tokens[start:end])\n", + " return out\n", + "\n", + "def find_prefix(tokens):\n", + " for i, c in enumerate(tokens):\n", + " if c == ':':\n", + " return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n", + " return None, tokens\n", + " \n", + "def process_tag_tokens(tokens):\n", + " parts = split_tags(tokens)\n", + " main_tag = parts[0]\n", + " if main_tag[0] in ('+', '-'):\n", + " main_tag = ''.join(main_tag)\n", + " main_tag = MassModification(main_tag)\n", + " else:\n", + " prefix, value = find_prefix(main_tag)\n", + " if prefix is None:\n", + " main_tag = GenericModification(''.join(value))\n", + " else:\n", + " tag_type = TagBase.prefix_map[prefix]\n", + " main_tag = tag_type(value)\n", + 
" if len(parts) > 1:\n", + " extras = []\n", + " for part in parts:\n", + " prefix, value = find_prefix(part)\n", + " if prefix is None:\n", + " if value.startswith(\"#\"):\n", + " main_tag.group_id = value\n", + " else:\n", + " main_tag.extra.append(GenericModification(''.join(value)))\n", + " else:\n", + " tag_type = TagBase.prefix_map[prefix]\n", + " main_tag.extra.append(tag_type(value))\n", + " return main_tag" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'QWERTYIPASDFGHKLCVNM'" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyteomics import parser\n", + "''.join(parser.std_amino_acids)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "class ParserStateEnum(Enum):\n", + " before_sequence = 0\n", + " tag_before_sequence = 1\n", + " global_tag = 2\n", + " fixed_spec = 3\n", + " labile_tag = 4\n", + " sequence = 5\n", + " tag_in_sequence = 6\n", + " interval_tag = 7\n", + " tag_after_sequence = 8\n", + " \n", + " done = 999\n", + "\n", + "\n", + "BEFORE = ParserStateEnum.before_sequence\n", + "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n", + "FIXED = ParserStateEnum.fixed_spec\n", + "GLOBAL = ParserStateEnum.global_tag\n", + "LABILE = ParserStateEnum.labile_tag\n", + "SEQ = ParserStateEnum.sequence\n", + "TAG = ParserStateEnum.tag_in_sequence\n", + "INTERVAL_TAG = ParserStateEnum.interval_tag\n", + "TAG_AFTER = ParserStateEnum.tag_after_sequence\n", + "DONE = ParserStateEnum.done\n", + "\n", + "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n", + "\n", + "def tokenize_proforma(sequence):\n", + " labile_modifications = []\n", + " fixed_modifications = []\n", + " unlocalized_modifications = []\n", + " intervals = []\n", + " isotopes = []\n", + " \n", + " n_term = None\n", + " c_term = None\n", + " \n", + " i = 0\n", + " n = len(sequence)\n", + " 
\n", + " positions = []\n", + " state = BEFORE\n", + " depth = 0\n", + " \n", + " current_aa = None\n", + " current_tag = []\n", + " current_interval = None\n", + " \n", + " while i < n:\n", + " c = sequence[i]\n", + " i += 1\n", + " if state == BEFORE:\n", + " if c == '[':\n", + " state = TAG_BEFORE\n", + " depth = 1\n", + " elif c == '{':\n", + " state = LABILE\n", + " depth = 1\n", + " elif c == '<':\n", + " state = FIXED\n", + " elif c in VALID_AA:\n", + " current_aa = c\n", + " state = SEQ\n", + " elif c == '?':\n", + " if current_tag:\n", + " unlocalized_modifications.append(process_tag_tokens(current_tag))\n", + " current_tag = []\n", + " else:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " elif c == '-':\n", + " if current_tag:\n", + " n_term = process_tag_tokens(current_tag)\n", + " current_tag = []\n", + " else:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " else:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " elif state == SEQ:\n", + " if c in VALID_AA:\n", + " positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n", + " current_aa = c\n", + " current_tag = []\n", + " elif c == '[':\n", + " state = TAG\n", + " depth = 1\n", + " elif c == '(':\n", + " current_interval = [len(positions), None, None]\n", + " elif c == ')':\n", + " if current_interval is None:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " else:\n", + " current_interval[1] = len(positions)\n", + " if i >= n or sequence[i] != '[':\n", + " raise Exception(\"Missing Interval Tag\")\n", + " i += 1\n", + " depth = 1\n", + " state = INTERVAL_TAG\n", + " elif c == '-':\n", + " state = TAG_AFTER\n", + " if i >= n or sequence[i] != '[':\n", + " raise Exception(\"Missing Interval Tag\")\n", + " i += 
1\n", + " depth = 1 \n", + " else:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER:\n", + " if c == '[':\n", + " depth += 1\n", + " elif c == ']':\n", + " depth -= 1\n", + " if depth <= 0:\n", + " depth = 0\n", + " if state == TAG: \n", + " state = SEQ\n", + " elif state == TAG_BEFORE:\n", + " state = BEFORE\n", + " elif state == TAG_AFTER:\n", + " c_term = process_tag_tokens(current_tag)\n", + " state = DONE\n", + " else:\n", + " current_tag.append(c)\n", + " elif state == LABILE:\n", + " if c == '{':\n", + " depth += 1\n", + " elif c == '}':\n", + " depth -= 1\n", + " if depth <= 0:\n", + " depth = 0\n", + " labile_modifications.append(process_tag_tokens(current_tag))\n", + " current_tag = []\n", + " state = BEFORE\n", + " else:\n", + " current_tag.append(c)\n", + " elif state == INTERVAL_TAG:\n", + " if c == '[':\n", + " depth += 1\n", + " elif c == ']':\n", + " depth -= 1\n", + " if depth <= 0:\n", + " depth = 0\n", + " current_interval[2] = process_tag_tokens(current_tag)\n", + " current_tag = []\n", + " intervals.append(current_interval)\n", + " current_interval = None\n", + " state = SEQ\n", + " else:\n", + " current_tag.append(c)\n", + " else:\n", + " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " if current_aa:\n", + " positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n", + " return positions, {\n", + " 'n_term': n_term,\n", + " 'c_term': c_term,\n", + " 'unlocalized_modifications': unlocalized_modifications,\n", + " 'labile_modifications': labile_modifications,\n", + " 'intervals': intervals,\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([('S', None),\n", + " ('T', UnimodModification('Ox', [], None)),\n", + " ('E', None),\n", + " ('P', 
None),\n", + " ('P', None),\n", + " ('I', None),\n", + " ('N', None),\n", + " ('G', None)],\n", + " {'n_term': GenericModification('Hex', [], None),\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [],\n", + " 'labile_modifications': [GenericModification('Foo', [], None)],\n", + " 'intervals': [[1, 4, MassModification(18.0, [], None)]]})" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seq, fields = tokenize_proforma(\"{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n", + "seq, fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c79c3e89212e5fab14401574c9aabc1ab66d10f8 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 19 Dec 2020 23:00:26 -0500 Subject: [PATCH 02/27] A draft parser for ProForma without any semantics on the returned object --- pyteomics/proforma.py | 815 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 815 insertions(+) create mode 100644 pyteomics/proforma.py diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py new file mode 100644 index 00000000..26ea3355 --- /dev/null +++ b/pyteomics/proforma.py @@ -0,0 +1,815 @@ +'''A simple ProForma lexer + +The primary interface is through :func:`parse_proforma`: + + >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK") + ([('E', None), + ('M', GenericModification('Oxidation', None, None)), + ('E', None), + ('V', None), + ('T', LocalizationMarker(0.01, None, '#g1')), + ('S', LocalizationMarker(0.09, None, 
'#g1')), + ('E', None), + ('S', + GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')), + ('P', None), + ('E', None), + ('K', None)], + {'n_term': None, + 'c_term': None, + 'unlocalized_modifications': [], + 'labile_modifications': [], + 'fixed_modifications': [], + 'intervals': [], + 'isotopes': [], + 'group_ids': ['#g1']}) + +''' + +import re +from collections import namedtuple, defaultdict + +try: + from enum import Enum +except ImportError: + # Python 2 doesn't have a builtin Enum type + Enum = object + +from six import add_metaclass + +from pyteomics import parser +from pyteomics.mass import Composition + + +class PrefixSavingMeta(type): + '''A subclass-registering-metaclass that provides easy + lookup of subclasses by prefix attributes. + ''' + + def __new__(mcs, name, parents, attrs): + new_type = type.__new__(mcs, name, parents, attrs) + prefix = attrs.get("prefix_name") + if prefix: + new_type.prefix_map[prefix.lower()] = new_type + short = attrs.get("short_prefix") + if short: + new_type.prefix_map[short.lower()] = new_type + return new_type + + def find_by_tag(self, tag_name): + if tag_name is None: + raise ValueError("tag_name cannot be None!") + tag_name = tag_name.lower() + return self.prefix_map[tag_name] + + +class TagTypeEnum(Enum): + unimod = 0 + psimod = 1 + massmod = 2 + generic = 3 + info = 4 + gnome = 5 + xlmod = 6 + + formula = 7 + glycan = 8 + + localization_marker = 9 + position_label = 10 + group_placeholder = 999 + + +_sentinel = object() + + +@add_metaclass(PrefixSavingMeta) +class TagBase(object): + '''A base class for all tag types. + + Attributes + ---------- + type: Enum + An element of :class:`TagTypeEnum` saying what kind of tag this is. + value: object + The data stored in this tag, usually an externally controlled name + extra: list + Any extra tags that were nested within this tag. Usually limited to INFO + tags but may be other synonymous controlled vocabulary terms. 
+ group_id: str or None + A short label denoting which group, if any, this tag belongs to + ''' + __slots__ = ("type", "value", "extra", "group_id") + + prefix_name = None + short_prefix = None + prefix_map = {} + + def __init__(self, type, value, extra=None, group_id=None): + self.type = type + self.value = value + self.extra = extra + self.group_id = group_id + + def __str__(self): + part = self._format_main() + if self.extra: + rest = [str(e) for e in self.extra] + label = '|'.join([part] + rest) + else: + label = part + if self.group_id: + label = '%s%s' % (label, self.group_id) + return '%s' % label + + def __repr__(self): + template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})" + return template.format(self=self) + + def __eq__(self, other): + if other is None: + return False + return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \ + and (self.group_id == other.group_id) + + def __ne__(self, other): + return not self == other + + def find_extra(self, label): + out = [] + if not self.extra: + return out + for e in self.extra: + if e.type == label: + out.append(e) + return out + + +class PositionLabelTag(TagBase): + '''A tag to mark that a position is involved in a group in some way, but does + not imply any specific semantics. 
+ ''' + __slots__ = () + + def __init__(self, value=None, extra=None, group_id=None): + assert group_id is not None + super(PositionLabelTag, self).__init__( + TagTypeEnum.position_label, group_id, extra, group_id) + + def _format_main(self): + return "#{self.group_id}".format(self=self) + + +class LocalizationMarker(TagBase): + '''A tag to mark a particular localization site + ''' + __slots__ = () + + def __init__(self, value, extra=None, group_id=None): + assert group_id is not None + super(LocalizationMarker, self).__init__( + TagTypeEnum.localization_marker, float(value), extra, group_id) + + def _format_main(self): + return "#{self.group_id}({self.value!f})".format(self=self) + + +class InformationTag(TagBase): + '''A tag carrying free text describing the location + ''' + __slots__ = () + + prefix_name = "INFO" + + def __init__(self, value, extra=None, group_id=None): + super(InformationTag, self).__init__( + TagTypeEnum.info, str(value), extra, group_id) + + def _format_main(self): + return str(self.value) + + +class MassModification(TagBase): + '''A modification defined purely by a signed mass shift in Daltons. + + The value of a :class:`MassModification` is always a :class:`float` + ''' + __slots__ = () + + def __init__(self, value, extra=None, group_id=None): + super(MassModification, self).__init__( + TagTypeEnum.massmod, float(value), extra, group_id) + + def _format_main(self): + return '%0.4f' % self.value + + +class ModificationBase(TagBase): + '''A base class for all modification tags with marked prefixes. 
+ ''' + + _tag_type = None + __slots__ = () + + def __init__(self, value, extra=None, group_id=None): + super(ModificationBase, self).__init__( + self._tag_type, value, extra, group_id) + + def _format_main(self): + return "{self.prefix_name}:{self.value}".format(self=self) + + def resolve(self): + '''Find the term and return it's properties + ''' + raise NotImplementedError() + + +class FormulaModification(ModificationBase): + prefix_name = "Formula" + + _tag_type = TagTypeEnum.formula + + def resolve(self): + # The handling of fixed isotopes is wrong here as Pyteomics uses a different + # convention. + from pyteomics.mass import Composition + composition = Composition(formula=''.join(self.value.split(" "))) + return { + "mass": composition.mass(), + "composition": composition + } + + +class GlycanModification(ModificationBase): + prefix_name = "Glycan" + + _tag_type = TagTypeEnum.glycan + + +class GenericModification(TagBase): + __slots__ = () + + def __init__(self, value, extra=None, group_id=None): + super(GenericModification, self).__init__( + TagTypeEnum.generic, value, extra, group_id) + + def _format_main(self): + return self.value + + def resolve(self): + '''Find the term, searching through all available vocabularies and + return the first match's properties + ''' + raise NotImplementedError() + + +class UnimodModification(ModificationBase): + __slots__ = () + + prefix_name = "UNIMOD" + short_prefix = "U" + _tag_type = TagTypeEnum.unimod + + +class PSIModModification(ModificationBase): + __slots__ = () + + prefix_name = "MOD" + short_prefix = 'M' + _tag_type = TagTypeEnum.psimod + + +class GNOmeModification(ModificationBase): + __slots__ = () + + prefix_name = "GNO" + # short_prefix = 'G' + _tag_type = TagTypeEnum.gnome + + +class XLMODModification(ModificationBase): + __slots__ = () + + prefix_name = "XLMOD" + # short_prefix = 'XL' + _tag_type = TagTypeEnum.xlmod + + +def split_tags(tokens): + '''Split a token array into discrete sets of tag + tokens. 
+ + Parameters + ---------- + tokens: list + The characters of the tag token buffer + + Returns + ------- + list of list: + The tokens for each contained tag + ''' + starts = [0] + ends = [] + for i, c in enumerate(tokens): + if c == '|': + ends.append(i) + starts.append(i + 1) + elif (i != 0 and c == '#'): + ends.append(i) + starts.append(i) + ends.append(len(tokens)) + out = [] + for i, start in enumerate(starts): + end = ends[i] + out.append(tokens[start:end]) + return out + + +def find_prefix(tokens): + '''Find the prefix, if any of the tag defined by `tokens` + delimited by ":". + + Parameters + ---------- + tokens: list + The tag tokens to search + + Returns + ------- + prefix: str or None + The prefix string, if found + rest: str + The rest of the tokens, merged as a string + ''' + for i, c in enumerate(tokens): + if c == ':': + return ''.join(tokens[:i]), ''.join(tokens[i + 1:]) + return None, ''.join(tokens) + + +def process_marker(tokens): + '''Process a marker, which is a tag whose value starts with #. + + Parameters + ---------- + tokens: list + The tag tokens to parse + + Returns + ------- + PositionLabelTag or LocalizationMarker + ''' + if tokens[1:3] == 'XL': + return PositionLabelTag(None, group_id=''.join(tokens)) + else: + group_id = None + value = None + for i, c in enumerate(tokens): + if c == '(': + group_id = ''.join(tokens[:i]) + if tokens[-1] != ')': + raise Exception( + "Localization marker with score missing closing parenthesis") + value = float(''.join(tokens[i + 1:-1])) + return LocalizationMarker(value, group_id=group_id) + else: + group_id = ''.join(tokens) + return PositionLabelTag(group_id=group_id) + + +def process_tag_tokens(tokens): + '''Convert a tag token buffer into a parsed :class:`TagBase` instance + of the appropriate sub-type with zero or more sub-tags. 
+ + Parameters + ---------- + tokens: list + The tokens to parse + + Returns + ------- + TagBase: + The parsed tag + ''' + parts = split_tags(tokens) + main_tag = parts[0] + if main_tag[0] in ('+', '-'): + main_tag = ''.join(main_tag) + main_tag = MassModification(main_tag) + elif main_tag[0] == '#': + main_tag = process_marker(main_tag) + else: + prefix, value = find_prefix(main_tag) + if prefix is None: + main_tag = GenericModification(''.join(value)) + else: + tag_type = TagBase.find_by_tag(prefix) + main_tag = tag_type(value) + if len(parts) > 1: + extras = [] + for part in parts[1:]: + prefix, value = find_prefix(part) + if prefix is None: + if value[0] == "#": + marker = process_marker(value) + if isinstance(marker, PositionLabelTag): + main_tag.group_id = ''.join(value) + else: + main_tag.group_id = marker.group_id + extras.append(marker) + else: + extras.append(GenericModification(''.join(value))) + else: + tag_type = TagBase.find_by_tag(prefix) + extras.append(tag_type(value)) + main_tag.extra = extras + return main_tag + + +class ModificationRule(object): + '''Define a fixed modification rule which dictates a modification tag is + always applied at one or more amino acid residues. 
+ + Attributes + ---------- + modification_tag: TagBase + The modification to apply + targets: list + The list of amino acids this applies to + ''' + __slots__ = ('modification_tag', 'targets') + + def __init__(self, modification_tag, targets=None): + self.modification_tag = modification_tag + self.targets = targets + + def __eq__(self, other): + if other is None: + return False + return self.modification_tag == other.modification_tag and self.targets == other.targets + + def __ne__(self, other): + return not self == other + + def __str__(self): + targets = ','.join(self.targets) + return "<{self.modification_tag}@{targets}>".format(self=self, targets=targets) + + def __repr__(self): + return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self) + + +class StableIsotope(object): + '''Define a fixed isotope that is applied globally to all amino acids. + + Attributes + ---------- + isotope: str + The stable isotope string, of the form [] or a special + isotopoform's name. + ''' + __slots__ = ('isotope', ) + + def __init__(self, isotope): + self.isotope = isotope + + def __eq__(self, other): + if other is None: + return False + return self.isotope == other.isotope + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "<{self.isotope}>".format(self=self) + + def __repr__(self): + return "{self.__class__.__name__}({self.isotope})".format(self=self) + + +class TaggedInterval(object): + '''Define a fixed interval over the associated sequence which contains the localization + of the associated tag. 
+ + Attributes + ---------- + start: int + The starting position (inclusive) of the interval along the primary sequence + end: int + The ending position (exclusive) of the interval along the primary sequence + tag: TagBase + The tag being localized + ''' + __slots__ = ('start', 'end', 'tag') + + def __init__(self, start, end=None, tag=None): + self.start = start + self.end = end + self.tag = tag + + def __eq__(self, other): + if other is None: + return False + return self.start == other.start and self.end == other.end and self.tag == other.tag + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "({self.start}-{self.end}){self.tag!r}".format(self=self) + + def __repr__(self): + return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self) + + +class TagParser(object): + '''A parser which accumulates tokens until it is asked to parse them into + :class:`TagBase` instances. + + Implements a subset of the Sequence protocol. + + Attributes + ---------- + buffer: list + The list of tokens accumulated since the last parsing. + group_ids: set + The set of all group IDs that have been produced so far. + ''' + + def __init__(self, initial=None, group_ids=None): + if initial: + self.buffer = list(initial) + else: + self.buffer = [] + if group_ids: + self.group_ids = set(group_ids) + else: + self.group_ids = set() + + def append(self, c): + '''Append a new character to the buffer. + + Parameters + ---------- + c: str + The character appended + ''' + self.buffer.append(c) + + def reset(self): + '''Discard the content of the current buffer. + ''' + self.buffer = [] + + def __bool__(self): + return bool(self.buffer) + + def __iter__(self): + return iter(self.buffer) + + def __getitem__(self, i): + return self.buffer[i] + + def __len__(self): + return len(self.buffer) + + def process(self): + '''Parse the content of the internal buffer, clear the buffer, + and return the parsed tag. 
+ + Returns + ------- + TagBase + ''' + tag = process_tag_tokens(self.buffer) + if tag.group_id: + self.group_ids.add(tag.group_id) + self.reset() + return tag + + +class ParserStateEnum(Enum): + before_sequence = 0 + tag_before_sequence = 1 + global_tag = 2 + fixed_spec = 3 + labile_tag = 4 + sequence = 5 + tag_in_sequence = 6 + interval_tag = 7 + tag_after_sequence = 8 + stable_isotope = 9 + + done = 999 + + +BEFORE = ParserStateEnum.before_sequence +TAG_BEFORE = ParserStateEnum.tag_before_sequence +FIXED = ParserStateEnum.fixed_spec +GLOBAL = ParserStateEnum.global_tag +ISOTOPE = ParserStateEnum.stable_isotope +LABILE = ParserStateEnum.labile_tag +SEQ = ParserStateEnum.sequence +TAG = ParserStateEnum.tag_in_sequence +INTERVAL_TAG = ParserStateEnum.interval_tag +TAG_AFTER = ParserStateEnum.tag_after_sequence +DONE = ParserStateEnum.done + +VALID_AA = set("QWERTYIPASDFGHKLCVNM") + +def parse_proforma(sequence): + '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a + mapping of sequence-spanning modifiers. + + .. note:: + This is a state machine parser, but with certain sub-state paths + unrolled to avoid an explosion of formal intermediary states. 
+ + Parameters + ---------- + sequence: str + The sequence to parse + + Returns + ------- + parsed_sequence: list + The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence + modifiers: dict + A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized + modifications, tagged intervals, and group IDs + ''' + labile_modifications = [] + fixed_modifications = [] + unlocalized_modifications = [] + intervals = [] + isotopes = [] + + n_term = None + c_term = None + + i = 0 + n = len(sequence) + + positions = [] + state = BEFORE + depth = 0 + + current_aa = None + current_tag = TagParser() + current_interval = None + + while i < n: + c = sequence[i] + i += 1 + if state == BEFORE: + if c == '[': + state = TAG_BEFORE + depth = 1 + elif c == '{': + state = LABILE + depth = 1 + elif c == '<': + state = FIXED + elif c in VALID_AA: + current_aa = c + state = SEQ + else: + raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + elif state == SEQ: + if c in VALID_AA: + positions.append((current_aa, current_tag.process() if current_tag else None)) + current_aa = c + elif c == '[': + state = TAG + depth = 1 + elif c == '(': + current_interval = TaggedInterval(len(positions) + 1) + elif c == ')': + if current_interval is None: + raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + else: + current_interval.end = len(positions) + 1 + if i >= n or sequence[i] != '[': + raise Exception("Missing Interval Tag") + i += 1 + depth = 1 + state = INTERVAL_TAG + elif c == '-': + state = TAG_AFTER + if i >= n or sequence[i] != '[': + raise Exception("Missing Interval Tag") + i += 1 + depth = 1 + else: + raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL: + if c == '[': + depth += 1 + elif c == ']': + depth -= 1 
+ if depth <= 0: + depth = 0 + if state == TAG: + state = SEQ + elif state == TAG_BEFORE: + if i < n: + cnext = sequence[i] + if cnext == '?': + unlocalized_modifications.append(current_tag.process()) + i += 1 + elif cnext == '-': + n_term = current_tag.process() + i += 1 + else: + i += 1 + raise Exception("Error In State {state}, unexpected {cnext} found at index {i}".format(**locals())) + + state = BEFORE + elif state == TAG_AFTER: + c_term = current_tag.process() + state = DONE + elif state == GLOBAL: + # Gobble the rest of the global tag inline to avoid spawning + # a whole new state. + if i < n: + c = sequence[i] + i += 1 + if c != '@': + raise Exception( + ("Error In State {state}, fixed modification detected without " + "target amino acids found at index {i}").format(**locals())) + end = 0 + targets = [] + while i < n: + c = sequence[i] + i += 1 + if c in VALID_AA: + targets.append(c) + elif c == ',': + pass + elif '>': + break + else: + raise Exception( + ("Error In State {state}, unclosed fixed modification rule").format(**locals())) + + fixed_modifications.append( + ModificationRule(current_tag.process(), targets)) + state = BEFORE + else: + current_tag.append(c) + elif state == FIXED: + if c == '[': + state = GLOBAL + else: + state = ISOTOPE + current_tag.append(c) + elif state == ISOTOPE: + if c != '>': + current_tag.append(c) + else: + isotopes.append(StableIsotope(''.join(current_tag))) + current_tag.reset() + state = BEFORE + elif state == LABILE: + if c == '{': + depth += 1 + elif c == '}': + depth -= 1 + if depth <= 0: + depth = 0 + labile_modifications.append(current_tag.process()) + state = BEFORE + else: + current_tag.append(c) + elif state == INTERVAL_TAG: + if c == '[': + depth += 1 + elif c == ']': + depth -= 1 + if depth <= 0: + depth = 0 + current_interval.tag = current_tag.process() + intervals.append(current_interval) + current_interval = None + state = SEQ + else: + current_tag.append(c) + else: + raise Exception("Error In State {state}, 
unexpected {c} found at index {i}".format(**locals())) + if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): + raise Exception("Error In State {state}, unclosed group reached end of string!".format(**locals())) + if current_aa: + positions.append((current_aa, current_tag.process() if current_tag else None)) + return positions, { + 'n_term': n_term, + 'c_term': c_term, + 'unlocalized_modifications': unlocalized_modifications, + 'labile_modifications': labile_modifications, + 'fixed_modifications': fixed_modifications, + 'intervals': intervals, + 'isotopes': isotopes, + 'group_ids': list(current_tag.group_ids) + } From 8c5301edb80a3366f57fc8e166c241c4b3b37b86 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sat, 19 Dec 2020 23:01:00 -0500 Subject: [PATCH 03/27] Updates to the notebook --- proforma_parsing.ipynb | 767 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 676 insertions(+), 91 deletions(-) diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb index f87866a1..6ca08254 100644 --- a/proforma_parsing.ipynb +++ b/proforma_parsing.ipynb @@ -2,38 +2,55 @@ "cells": [ { "cell_type": "code", - "execution_count": 62, + "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "import re\n", "from collections import namedtuple, defaultdict\n", - "from enum import Enum\n", "\n", - "from six import add_metaclass" + "try:\n", + " from enum import Enum\n", + "except ImportError:\n", + " # Python 2 doesn't have a builtin Enum type\n", + " Enum = object\n", + "\n", + "from six import add_metaclass\n", + "\n", + "from pyteomics import parser" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ "class PrefixSavingMeta(type):\n", + " '''A subclass-registering-metaclass that provides easy\n", + " lookup of subclasses by prefix attributes.\n", + " '''\n", + "\n", " def __new__(mcs, name, parents, attrs):\n", " new_type = type.__new__(mcs, name, parents, attrs)\n", " prefix = 
attrs.get(\"prefix_name\")\n", " if prefix:\n", - " new_type.prefix_map[prefix] = new_type\n", + " new_type.prefix_map[prefix.lower()] = new_type\n", " short = attrs.get(\"short_prefix\")\n", " if short:\n", - " new_type.prefix_map[short] = new_type\n", - " return new_type" + " new_type.prefix_map[short.lower()] = new_type\n", + " return new_type\n", + " \n", + " def find_by_tag(self, tag_name):\n", + " if tag_name is None:\n", + " raise ValueError(\"tag_name cannot be None!\")\n", + " tag_name = tag_name.lower()\n", + " return self.prefix_map[tag_name]" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 227, "metadata": {}, "outputs": [], "source": [ @@ -44,15 +61,35 @@ " generic = 3\n", " info = 4\n", " gnome = 5\n", - " formula = 6\n", - " glycan = 7\n", - " xlmod = 8\n", + " xlmod = 6\n", + "\n", + " formula = 7\n", + " glycan = 8\n", + "\n", " localization_marker = 9\n", + " position_label = 10\n", " group_placeholder = 999\n", - " \n", + "\n", + "\n", + "_sentinel = object()\n", + "\n", "\n", "@add_metaclass(PrefixSavingMeta)\n", "class TagBase(object):\n", + " '''A base class for all tag types.\n", + "\n", + " Attributes\n", + " ----------\n", + " type: Enum\n", + " An element of :class:`TagTypeEnum` saying what kind of tag this is.\n", + " value: object\n", + " The data stored in this tag, usually an externally controlled name\n", + " extra: list\n", + " Any extra tags that were nested within this tag. 
Usually limited to INFO\n", + " tags but may be other synonymous controlled vocabulary terms.\n", + " group_id: str or None\n", + " A short label denoting which group, if any, this tag belongs to\n", + " '''\n", " __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n", "\n", " prefix_name = None\n", @@ -62,7 +99,7 @@ " def __init__(self, type, value, extra=None, group_id=None):\n", " self.type = type\n", " self.value = value\n", - " self.extra = extra or []\n", + " self.extra = extra\n", " self.group_id = group_id\n", "\n", " def __str__(self):\n", @@ -73,15 +110,49 @@ " else:\n", " label = part\n", " if self.group_id:\n", - " label = '%s#%s' % (label, self.group_id)\n", - " return label\n", + " label = '%s%s' % (label, self.group_id)\n", + " return '%s' % label\n", " \n", " def __repr__(self):\n", " template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n", " return template.format(self=self)\n", + " \n", + " def __eq__(self, other):\n", + " if other is None:\n", + " return False\n", + " return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \\\n", + " and (self.group_id == other.group_id)\n", + "\n", + " def __ne__(self, other):\n", + " return not self == other\n", + "\n", + " def find_extra(self, label):\n", + " out = []\n", + " if not self.extra:\n", + " return out\n", + " for e in self.extra:\n", + " if e.type == label:\n", + " out.append(e)\n", + " return out\n", + "\n", + "\n", + "class PositionLabelTag(TagBase):\n", + " '''A tag to mark that a position is involved in a group in some way, but does\n", + " not imply any specific semantics.\n", + " '''\n", + " __slots__ = ()\n", + "\n", + " def __init__(self, value=None, extra=None, group_id=None):\n", + " assert group_id is not None\n", + " super(PositionLabelTag, self).__init__(TagTypeEnum.position_label, group_id, extra, group_id)\n", + " \n", + " def _format_main(self):\n", + " return 
\"#{self.group_id}\".format(self=self)\n", "\n", "\n", "class LocalizationMarker(TagBase):\n", + " '''A tag to mark a particular localization site \n", + " '''\n", " __slots__ = ()\n", " \n", " def __init__(self, value, extra=None, group_id=None):\n", @@ -90,9 +161,27 @@ " \n", " def _format_main(self):\n", " return \"#{self.group_id}({self.value!f})\".format(self=self)\n", - " \n", - " \n", + "\n", + "\n", + "class InformationTag(TagBase):\n", + " '''A tag carrying free text describing the location\n", + " '''\n", + " __slots__ = ()\n", + "\n", + " prefix_name = \"INFO\"\n", + "\n", + " def __init__(self, value, extra=None, group_id=None):\n", + " super(InformationTag, self).__init__(TagTypeEnum.info, str(value), extra, group_id)\n", + "\n", + " def _format_main(self):\n", + " return str(self.value)\n", + "\n", + "\n", "class MassModification(TagBase):\n", + " '''A modification defined purely by a signed mass shift in Daltons.\n", + "\n", + " The value of a :class:`MassModification` is always a :class:`float`\n", + " '''\n", " __slots__ = ()\n", " \n", " def __init__(self, value, extra=None, group_id=None):\n", @@ -101,17 +190,48 @@ " def _format_main(self):\n", " return '%0.4f' % self.value\n", "\n", + "\n", " \n", - "class ControlledVocabularyModificationBase(TagBase):\n", + "class ModificationBase(TagBase):\n", + " '''A base class for all modification tags with marked prefixes.\n", + " '''\n", + "\n", " _tag_type = None\n", " __slots__ = ()\n", " \n", " def __init__(self, value, extra=None, group_id=None):\n", - " super(ControlledVocabularyModificationBase, self).__init__(\n", + " super(ModificationBase, self).__init__(\n", " self._tag_type, value, extra, group_id)\n", "\n", " def _format_main(self):\n", " return \"{self.prefix_name}:{self.value}\".format(self=self)\n", + " \n", + " def resolve(self):\n", + " '''Find the term and return it's properties\n", + " '''\n", + " raise NotImplementedError()\n", + "\n", + "\n", + "class 
FormulaModification(ModificationBase):\n", + " prefix_name = \"Formula\"\n", + "\n", + " _tag_type = TagTypeEnum.formula\n", + "\n", + " def resolve(self):\n", + " # The handling of fixed isotopes is wrong here as Pyteomics uses a different\n", + " # convention.\n", + " from pyteomics.mass import Composition\n", + " composition = Composition(formula=''.join(self.value.split(\" \")))\n", + " return {\n", + " \"mass\": composition.mass(),\n", + " \"composition\": composition\n", + " }\n", + "\n", + "\n", + "class GlycanModification(ModificationBase):\n", + " prefix_name = \"Glycan\"\n", + "\n", + " _tag_type = TagTypeEnum.glycan\n", "\n", " \n", "class GenericModification(TagBase):\n", @@ -122,9 +242,15 @@ " \n", " def _format_main(self):\n", " return self.value\n", - " \n", "\n", - "class UnimodModification(ControlledVocabularyModificationBase):\n", + " def resolve(self):\n", + " '''Find the term, searching through all available vocabularies and\n", + " return the first match's properties\n", + " '''\n", + " raise NotImplementedError()\n", + "\n", + "\n", + "class UnimodModification(ModificationBase):\n", " __slots__ = ()\n", " \n", " prefix_name = \"UNIMOD\"\n", @@ -132,7 +258,7 @@ " _tag_type = TagTypeEnum.unimod\n", "\n", "\n", - "class PSIModModification(ControlledVocabularyModificationBase):\n", + "class PSIModModification(ModificationBase):\n", " __slots__ = ()\n", " \n", " prefix_name = \"MOD\"\n", @@ -140,33 +266,45 @@ " _tag_type = TagTypeEnum.psimod\n", "\n", "\n", - "class GNOmeModification(ControlledVocabularyModificationBase):\n", + "class GNOmeModification(ModificationBase):\n", " __slots__ = ()\n", " \n", " prefix_name = \"GNO\"\n", - " short_prefix = 'G'\n", + " # short_prefix = 'G'\n", " _tag_type = TagTypeEnum.gnome\n", "\n", " \n", - "class XLMODModification(ControlledVocabularyModificationBase):\n", + "class XLMODModification(ModificationBase):\n", " __slots__ = ()\n", " \n", " prefix_name = \"XLMOD\"\n", - "# short_prefix = 'XL'\n", + " # 
short_prefix = 'XL'\n", " _tag_type = TagTypeEnum.xlmod\n", - " \n", - " \n", - "class TagParserStateEnum(Enum):\n", - " start = 0\n", - " group_id = 1\n", + "\n", "\n", "def split_tags(tokens):\n", + " '''Split a token array into discrete sets of tag\n", + " tokens.\n", + "\n", + " Parameters\n", + " ----------\n", + " tokens: list\n", + " The characters of the tag token buffer\n", + " \n", + " Returns\n", + " -------\n", + " list of list:\n", + " The tokens for each contained tag\n", + " '''\n", " starts = [0]\n", " ends = []\n", " for i, c in enumerate(tokens):\n", " if c == '|':\n", " ends.append(i)\n", " starts.append(i + 1)\n", + " elif (i != 0 and c == '#'):\n", + " ends.append(i)\n", + " starts.append(i)\n", " ends.append(len(tokens))\n", " out = []\n", " for i, start in enumerate(starts):\n", @@ -174,64 +312,292 @@ " out.append(tokens[start:end])\n", " return out\n", "\n", + "\n", "def find_prefix(tokens):\n", + " '''Find the prefix, if any of the tag defined by `tokens`\n", + " delimited by \":\".\n", + "\n", + " Parameters\n", + " ----------\n", + " tokens: list\n", + " The tag tokens to search\n", + " \n", + " Returns\n", + " -------\n", + " prefix: str or None\n", + " The prefix string, if found\n", + " rest: str\n", + " The rest of the tokens, merged as a string\n", + " '''\n", " for i, c in enumerate(tokens):\n", " if c == ':':\n", " return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n", " return None, tokens\n", - " \n", + "\n", + "def process_marker(tokens):\n", + " '''Process a marker, which is a tag whose value starts with #.\n", + "\n", + " Parameters\n", + " ----------\n", + " tokens: list\n", + " The tag tokens to parse\n", + "\n", + " Returns\n", + " -------\n", + " PositionLabelTag or LocalizationMarker\n", + " '''\n", + " if tokens[1:3] == 'XL':\n", + " return PositionLabelTag(None, group_id=''.join(tokens))\n", + " else:\n", + " group_id = None\n", + " value = None\n", + " for i, c in enumerate(tokens):\n", + " if c == '(':\n", + " 
group_id = ''.join(tokens[:i])\n", + " if tokens[-1] != ')':\n", + " raise Exception(\"Localization marker with score missing closing parenthesis\")\n", + " value = float(''.join(tokens[i + 1:-1]))\n", + " return LocalizationMarker(value, group_id=group_id)\n", + " else:\n", + " group_id = ''.join(tokens)\n", + " return PositionLabelTag(group_id=group_id)\n", + " \n", + "\n", + "\n", "def process_tag_tokens(tokens):\n", + " '''Convert a tag token buffer into a parsed :class:`TagBase` instance\n", + " of the appropriate sub-type with zero or more sub-tags.\n", + "\n", + " Parameters\n", + " ----------\n", + " tokens: list\n", + " The tokens to parse\n", + " \n", + " Returns\n", + " -------\n", + " TagBase:\n", + " The parsed tag\n", + " '''\n", " parts = split_tags(tokens)\n", " main_tag = parts[0]\n", " if main_tag[0] in ('+', '-'):\n", " main_tag = ''.join(main_tag)\n", " main_tag = MassModification(main_tag)\n", + " elif main_tag[0] == '#':\n", + " main_tag = process_marker(main_tag)\n", " else:\n", " prefix, value = find_prefix(main_tag)\n", " if prefix is None:\n", " main_tag = GenericModification(''.join(value))\n", " else:\n", - " tag_type = TagBase.prefix_map[prefix]\n", + " tag_type = TagBase.find_by_tag(prefix)\n", " main_tag = tag_type(value)\n", " if len(parts) > 1:\n", " extras = []\n", - " for part in parts:\n", + " for part in parts[1:]:\n", " prefix, value = find_prefix(part)\n", " if prefix is None:\n", - " if value.startswith(\"#\"):\n", - " main_tag.group_id = value\n", + " if value[0] == \"#\":\n", + " marker = process_marker(value)\n", + " if isinstance(marker, PositionLabelTag):\n", + " main_tag.group_id = ''.join(value)\n", + " else:\n", + " main_tag.group_id = marker.group_id\n", + " extras.append(marker)\n", " else:\n", - " main_tag.extra.append(GenericModification(''.join(value)))\n", + " extras.append(GenericModification(''.join(value)))\n", " else:\n", - " tag_type = TagBase.prefix_map[prefix]\n", - " 
main_tag.extra.append(tag_type(value))\n", + " tag_type = TagBase.find_by_tag(prefix)\n", + " extras.append(tag_type(value))\n", + " main_tag.extra = extras\n", " return main_tag" ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 228, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'QWERTYIPASDFGHKLCVNM'" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from pyteomics import parser\n", - "''.join(parser.std_amino_acids)" + "class ModificationRule(object):\n", + " '''Define a fixed modification rule which dictates a modification tag is\n", + " always applied at one or more amino acid residues.\n", + "\n", + " Attributes\n", + " ----------\n", + " modification_tag: TagBase\n", + " The modification to apply\n", + " targets: list\n", + " The list of amino acids this applies to\n", + " '''\n", + " __slots__ = ('modification_tag', 'targets')\n", + "\n", + " def __init__(self, modification_tag, targets=None):\n", + " self.modification_tag = modification_tag\n", + " self.targets = targets\n", + "\n", + " def __eq__(self, other):\n", + " if other is None:\n", + " return False\n", + " return self.modification_tag == other.modification_tag and self.targets == other.targets\n", + "\n", + " def __ne__(self, other):\n", + " return not self == other\n", + "\n", + " def __str__(self):\n", + " targets = ','.join(self.targets)\n", + " return \"<{self.modification_tag}@{targets}>\".format(self=self, targets=targets)\n", + "\n", + " def __repr__(self):\n", + " return \"{self.__class__.__name__}({self.modification_tag!r}, {self.targets})\".format(self=self)\n", + "\n", + "\n", + "class StableIsotope(object):\n", + " '''Define a fixed isotope that is applied globally to all amino acids.\n", + "\n", + " Attributes\n", + " ----------\n", + " isotope: str\n", + " The stable isotope string, of the form [] or a special\n", + " isotopoform's name.\n", + " '''\n", + " 
__slots__ = ('isotope', )\n", + "\n", + " def __init__(self, isotope):\n", + " self.isotope = isotope\n", + "\n", + " def __eq__(self, other):\n", + " if other is None:\n", + " return False\n", + " return self.isotope == other.isotope\n", + "\n", + " def __ne__(self, other):\n", + " return not self == other\n", + "\n", + " def __str__(self):\n", + " return \"<{self.isotope}>\".format(self=self)\n", + "\n", + " def __repr__(self):\n", + " return \"{self.__class__.__name__}({self.isotope})\".format(self=self)\n", + "\n", + "\n", + "class TaggedInterval(object):\n", + " '''Define a fixed interval over the associated sequence which contains the localization\n", + " of the associated tag.\n", + "\n", + " Attributes\n", + " ----------\n", + " start: int\n", + " The starting position (inclusive) of the interval along the primary sequence\n", + " end: int\n", + " The ending position (exclusive) of the interval along the primary sequence\n", + " tag: TagBase\n", + " The tag being localized\n", + " '''\n", + " __slots__ = ('start', 'end', 'tag')\n", + "\n", + " def __init__(self, start, end=None, tag=None):\n", + " self.start = start\n", + " self.end = end\n", + " self.tag = tag\n", + " \n", + " def __eq__(self, other):\n", + " if other is None:\n", + " return False\n", + " return self.start == other.start and self.end == other.end and self.tag == other.tag\n", + "\n", + " def __ne__(self, other):\n", + " return not self == other\n", + "\n", + " def __str__(self):\n", + " return \"({self.start}-{self.end}){self.tag!r}\".format(self=self)\n", + "\n", + " def __repr__(self):\n", + " return \"{self.__class__.__name__}({self.start}, {self.end}, {self.tag})\".format(self=self)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": {}, + "outputs": [], + "source": [ + "class TagParser(object):\n", + " '''A parser which accumulates tokens until it is asked to parse them into\n", + " :class:`TagBase` instances.\n", + "\n", + " Implements a subset of the 
Sequence protocol.\n", + "\n", + " Attributes\n", + " ----------\n", + " buffer: list\n", + " The list of tokens accumulated since the last parsing.\n", + " group_ids: set\n", + " The set of all group IDs that have been produced so far.\n", + " '''\n", + "\n", + " def __init__(self, initial=None, group_ids=None):\n", + " if initial:\n", + " self.buffer = list(initial)\n", + " else:\n", + " self.buffer = []\n", + " if group_ids:\n", + " self.group_ids = set(group_ids)\n", + " else:\n", + " self.group_ids = set()\n", + " \n", + " def append(self, c):\n", + " '''Append a new character to the buffer.\n", + "\n", + " Parameters\n", + " ----------\n", + " c: str\n", + " The character appended\n", + " '''\n", + " self.buffer.append(c)\n", + " \n", + " def reset(self):\n", + " '''Discard the content of the current buffer.\n", + " '''\n", + " self.buffer = []\n", + " \n", + " def __bool__(self):\n", + " return bool(self.buffer)\n", + " \n", + " def __iter__(self):\n", + " return iter(self.buffer)\n", + "\n", + " def __getitem__(self, i):\n", + " return self.buffer[i]\n", + " \n", + " def __len__(self):\n", + " return len(self.buffer)\n", + "\n", + " def process(self):\n", + " '''Parse the content of the internal buffer, clear the buffer,\n", + " and return the parsed tag.\n", + "\n", + " Returns\n", + " -------\n", + " TagBase\n", + " '''\n", + " tag = process_tag_tokens(self.buffer)\n", + " if tag.group_id:\n", + " self.group_ids.add(tag.group_id)\n", + " self.reset()\n", + " return tag" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 230, "metadata": {}, "outputs": [], "source": [ @@ -245,7 +611,8 @@ " tag_in_sequence = 6\n", " interval_tag = 7\n", " tag_after_sequence = 8\n", - " \n", + " stable_isotope = 9\n", + "\n", " done = 999\n", "\n", "\n", @@ -253,6 +620,7 @@ "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n", "FIXED = 
ParserStateEnum.fixed_spec\n", "GLOBAL = ParserStateEnum.global_tag\n", + "ISOTOPE = ParserStateEnum.stable_isotope\n", "LABILE = ParserStateEnum.labile_tag\n", "SEQ = ParserStateEnum.sequence\n", "TAG = ParserStateEnum.tag_in_sequence\n", @@ -262,7 +630,27 @@ "\n", "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n", "\n", - "def tokenize_proforma(sequence):\n", + "def parse_proforma(sequence):\n", + " '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a\n", + " mapping of sequence-spanning modifiers.\n", + "\n", + " .. note::\n", + " This is a state machine parser, but with certain sub-state paths\n", + " unrolled to avoid an explosion of formal intermediary states.\n", + "\n", + " Parameters\n", + " ----------\n", + " sequence: str\n", + " The sequence to parse\n", + " \n", + " Returns\n", + " -------\n", + " parsed_sequence: list\n", + " The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence\n", + " modifiers: dict\n", + " A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized\n", + " modifications, tagged intervals, and group IDs\n", + " '''\n", " labile_modifications = []\n", " fixed_modifications = []\n", " unlocalized_modifications = []\n", @@ -280,7 +668,7 @@ " depth = 0\n", " \n", " current_aa = None\n", - " current_tag = []\n", + " current_tag = TagParser()\n", " current_interval = None\n", " \n", " while i < n:\n", @@ -298,35 +686,22 @@ " elif c in VALID_AA:\n", " current_aa = c\n", " state = SEQ\n", - " elif c == '?':\n", - " if current_tag:\n", - " unlocalized_modifications.append(process_tag_tokens(current_tag))\n", - " current_tag = []\n", - " else:\n", - " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " elif c == '-':\n", - " if current_tag:\n", - " n_term = process_tag_tokens(current_tag)\n", - " current_tag = []\n", - " else:\n", - " raise Exception(\"Error In State {state}, unexpected {c} 
found at index {i}\".format(**locals()))\n", " else:\n", " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", " elif state == SEQ:\n", " if c in VALID_AA:\n", - " positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n", + " positions.append((current_aa, current_tag.process() if current_tag else None))\n", " current_aa = c\n", - " current_tag = []\n", " elif c == '[':\n", " state = TAG\n", " depth = 1\n", " elif c == '(':\n", - " current_interval = [len(positions), None, None]\n", + " current_interval = TaggedInterval(len(positions) + 1)\n", " elif c == ')':\n", " if current_interval is None:\n", " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", " else:\n", - " current_interval[1] = len(positions)\n", + " current_interval.end = len(positions) + 1\n", " if i >= n or sequence[i] != '[':\n", " raise Exception(\"Missing Interval Tag\")\n", " i += 1\n", @@ -340,7 +715,7 @@ " depth = 1 \n", " else:\n", " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER:\n", + " elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:\n", " if c == '[':\n", " depth += 1\n", " elif c == ']':\n", @@ -350,12 +725,65 @@ " if state == TAG: \n", " state = SEQ\n", " elif state == TAG_BEFORE:\n", + " if i < n:\n", + " cnext = sequence[i]\n", + " if cnext == '?':\n", + " unlocalized_modifications.append(current_tag.process())\n", + " i += 1\n", + " elif cnext == '-':\n", + " n_term = current_tag.process()\n", + " i += 1\n", + " else:\n", + " i += 1\n", + " raise Exception(\"Error In State {state}, unexpected {cnext} found at index {i}\".format(**locals()))\n", + "\n", " state = BEFORE\n", " elif state == TAG_AFTER:\n", - " c_term = process_tag_tokens(current_tag)\n", + " c_term = current_tag.process()\n", " state = 
DONE\n", + " elif state == GLOBAL:\n", + " # Gobble the rest of the global tag inline to avoid spawning\n", + " # a whole new state.\n", + " if i < n:\n", + " c = sequence[i]\n", + " i += 1\n", + " if c != '@':\n", + " raise Exception(\n", + " (\"Error In State {state}, fixed modification detected without \"\n", + " \"target amino acids found at index {i}\").format(**locals()))\n", + " end = 0\n", + " targets = []\n", + " while i < n:\n", + " c = sequence[i]\n", + " i += 1\n", + " if c in VALID_AA:\n", + " targets.append(c)\n", + " elif c == ',':\n", + " pass\n", + " elif '>':\n", + " break\n", + " else:\n", + " raise Exception(\n", + " (\"Error In State {state}, unclosed fixed modification rule\").format(**locals()))\n", + "\n", + " fixed_modifications.append(\n", + " ModificationRule(current_tag.process(), targets))\n", + " state = BEFORE\n", + " else:\n", + " current_tag.append(c)\n", + " elif state == FIXED:\n", + " if c == '[':\n", + " state = GLOBAL\n", " else:\n", + " state = ISOTOPE\n", " current_tag.append(c)\n", + " elif state == ISOTOPE:\n", + " if c != '>':\n", + " current_tag.append(c)\n", + " else:\n", + " isotopes.append(StableIsotope(''.join(current_tag)))\n", + " current_tag.reset()\n", + " state = BEFORE\n", " elif state == LABILE:\n", " if c == '{':\n", " depth += 1\n", @@ -363,8 +791,7 @@ " depth -= 1\n", " if depth <= 0:\n", " depth = 0\n", - " labile_modifications.append(process_tag_tokens(current_tag))\n", - " current_tag = []\n", + " labile_modifications.append(current_tag.process())\n", " state = BEFORE\n", " else:\n", " current_tag.append(c)\n", @@ -375,8 +802,7 @@ " depth -= 1\n", " if depth <= 0:\n", " depth = 0\n", - " current_interval[2] = process_tag_tokens(current_tag)\n", - " current_tag = []\n", + " current_interval.tag = current_tag.process()\n", " intervals.append(current_interval)\n", " current_interval = None\n", " state = SEQ\n", @@ -384,50 +810,209 @@ " current_tag.append(c)\n", " else:\n", " raise Exception(\"Error In State 
{state}, unexpected {c} found at index {i}\".format(**locals()))\n", + " if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):\n", + " raise Exception(\"Error In State {state}, unclosed group reached end of string!\".format(**locals()))\n", " if current_aa:\n", - " positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))\n", + " positions.append((current_aa, current_tag.process() if current_tag else None))\n", " return positions, {\n", " 'n_term': n_term,\n", " 'c_term': c_term,\n", " 'unlocalized_modifications': unlocalized_modifications,\n", " 'labile_modifications': labile_modifications,\n", + " 'fixed_modifications': fixed_modifications,\n", " 'intervals': intervals,\n", + " 'isotopes': isotopes,\n", + " 'group_ids': list(current_tag.group_ids)\n", " }" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 231, "metadata": {}, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "([('S', None),\n", - " ('T', UnimodModification('Ox', [], None)),\n", + " ('T', UnimodModification('Ox', None, None)),\n", " ('E', None),\n", " ('P', None),\n", " ('P', None),\n", " ('I', None),\n", " ('N', None),\n", " ('G', None)],\n", - " {'n_term': GenericModification('Hex', [], None),\n", + " {'n_term': GenericModification('Hex', None, None),\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [GenericModification('Bar', None, None)],\n", + " 'labile_modifications': [GenericModification('Foo', None, None)],\n", + " 'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n", + " 'intervals': [TaggedInterval(2, 5, 18.0000)],\n", + " 'isotopes': [StableIsotope(13C)],\n", + " 'group_ids': []})" + ] + }, + "metadata": {}, + "execution_count": 231 + } + ], + "source": [ + "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Bar]?{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n", + "seq, fields" + ] + }, + { + "cell_type": "code", + "execution_count": 
232, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([('S', None),\n", + " ('E', None),\n", + " ('P', None),\n", + " ('P', None),\n", + " ('I', None),\n", + " ('N', None),\n", + " ('G', None)],\n", + " {'n_term': None,\n", " 'c_term': None,\n", " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [GenericModification('Foo', [], None)],\n", - " 'intervals': [[1, 4, MassModification(18.0, [], None)]]})" + " 'labile_modifications': [],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [TaggedInterval(1, 4, 18.0000)],\n", + " 'isotopes': [],\n", + " 'group_ids': []})" ] }, - "execution_count": 110, "metadata": {}, - "output_type": "execute_result" + "execution_count": 232 } ], "source": [ - "seq, fields = tokenize_proforma(\"{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n", + "seq, fields = tokenize_proforma(\"S(EPP)[+18]ING\")\n", "seq, fields" ] }, + { + "cell_type": "code", + "execution_count": 233, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([('E', None),\n", + " ('M', GenericModification('Oxidation', None, None)),\n", + " ('E', None),\n", + " ('V', None),\n", + " ('T', LocalizationMarker(0.01, None, '#s1')),\n", + " ('S', LocalizationMarker(0.09, None, '#s1')),\n", + " ('E', None),\n", + " ('S', LocalizationMarker(0.9, None, '#s1')),\n", + " ('P', None),\n", + " ('E', None),\n", + " ('K', None)],\n", + " {'n_term': None,\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [GenericModification('Phospho', [], '#s1')],\n", + " 'labile_modifications': [],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [],\n", + " 'isotopes': [],\n", + " 'group_ids': ['#s1']})" + ] + }, + "metadata": {}, + "execution_count": 233 + } + ], + "source": [ + "parse_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": {}, + "outputs": [ + { 
+ "output_type": "execute_result", + "data": { + "text/plain": [ + "([('E', None),\n", + " ('M', GenericModification('Oxidation', None, None)),\n", + " ('E', None),\n", + " ('V', None),\n", + " ('T', LocalizationMarker(0.01, None, '#g1')),\n", + " ('S', LocalizationMarker(0.09, None, '#g1')),\n", + " ('E', None),\n", + " ('S',\n", + " GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n", + " ('P', None),\n", + " ('E', None),\n", + " ('K', None)],\n", + " {'n_term': None,\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [],\n", + " 'labile_modifications': [],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [],\n", + " 'isotopes': [],\n", + " 'group_ids': ['#g1']})" + ] + }, + "metadata": {}, + "execution_count": 234 + } + ], + "source": [ + "tokenize_proforma(\"EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK\")" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([('E', None),\n", + " ('M', None),\n", + " ('E', None),\n", + " ('V', None),\n", + " ('T', LocalizationMarker(0.01, None, '#g1')),\n", + " ('S', LocalizationMarker(0.09, None, '#g1')),\n", + " ('E', None),\n", + " ('S',\n", + " GlycanModification('HexNAc 1', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n", + " ('P', None),\n", + " ('E', None),\n", + " ('K', None)],\n", + " {'n_term': None,\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [],\n", + " 'labile_modifications': [],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [],\n", + " 'isotopes': [],\n", + " 'group_ids': ['#g1']})" + ] + }, + "metadata": {}, + "execution_count": 235 + } + ], + "source": [ + "tokenize_proforma(\"EMEVT[#g1(0.01)]S[#g1(0.09)]ES[Glycan:HexNAc 1#g1(0.90)]PEK\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -452,9 +1037,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.8.5" + "version": "3.8.5-final" } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file From 2dbe1c819c3663ea75e21a54223bb7e128e76c3e Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Wed, 30 Dec 2020 23:57:30 -0500 Subject: [PATCH 04/27] Add cv resolver --- .gitignore | 3 +- pyteomics/proforma.py | 341 +++++++++++++++++++++++++++++++----------- 2 files changed, 258 insertions(+), 86 deletions(-) diff --git a/.gitignore b/.gitignore index 382ed2c6..156bb9dd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ *.egg-info *.pyc -.ipynb_checkpoints \ No newline at end of file +.ipynb_checkpoints +.vscode \ No newline at end of file diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 26ea3355..c98d29a9 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -39,6 +39,16 @@ from pyteomics import parser from pyteomics.mass import Composition +from pyteomics.auxiliary import PyteomicsError +from pyteomics.mass import Unimod + + +class ProFormaError(PyteomicsError): + def __init__(self, message, index=None, parser_state=None, **kwargs): + super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state) + self.message = message + self.index = index + self.parser_state = parser_state class PrefixSavingMeta(type): @@ -119,7 +129,7 @@ def __str__(self): else: label = part if self.group_id: - label = '%s%s' % (label, self.group_id) + label = '%s|%s' % (label, self.group_id) return '%s' % label def __repr__(self): @@ -204,24 +214,113 @@ def _format_main(self): return '%0.4f' % self.value +class ModificationResolver(object): + def __init__(self, name, *args, **kwargs): + self.name = name + + def resolve(self, name=None, id=None, **kwargs): + raise NotImplementedError() + + def __call__(self, name=None, id=None, **kwargs): + return self.resolve(name, id, **kwargs) + + +class UnimodResolver(ModificationResolver): + def __init__(self, *args, **kwargs): + super(UnimodResolver, 
self).__init__("unimod", *args, **kwargs) + self._database = kwargs.get("database" ) + + @property + def database(self): + if not self._database: + self._database = Unimod() + return self._database + + def resolve(self, name=None, id=None, **kwargs): + if name is not None: + defn = self.database.by_title(name) + if not defn: + defn = self.database.by_name(name) + if not defn: + raise KeyError(name) + elif id is not None: + defn = self.database.by_id(id) + if not defn: + raise KeyError(id) + else: + raise ValueError("Must provide one of `name` or `id`") + return { + 'composition': defn['composition'], + 'name': defn['title'], + 'id': defn['record_id'], + 'mass': defn['mono_mass'], + 'provider': self.name + } + + class ModificationBase(TagBase): '''A base class for all modification tags with marked prefixes. ''' _tag_type = None - __slots__ = () + __slots__ = ('_definition', ) def __init__(self, value, extra=None, group_id=None): super(ModificationBase, self).__init__( self._tag_type, value, extra, group_id) + self._definition = None + + @property + def definition(self): + if self._definition is None: + self._definition = self.resolve() + return self._definition + + @property + def mass(self): + return self.definition['mass'] + + @property + def composition(self): + return self.definition.get('composition') + + @property + def id(self): + return self.definition.get('id') + + @property + def name(self): + return self.definition.get('name') + + @property + def provider(self): + return self.definition.get('provider') + + def _populate_from_definition(self, definition): + self._definition = definition def _format_main(self): return "{self.prefix_name}:{self.value}".format(self=self) + def _parse_identifier(self): + tokens = self.value.split(":", 1) + if len(tokens) > 1: + value = tokens[1] + else: + value = self.value + if value.isdigit(): + id = int(value) + name = None + else: + name = value + id = None + return name, id + def resolve(self): '''Find the term and return 
it's properties ''' - raise NotImplementedError() + keys = self._parse_identifier() + return self.resolver(*keys) class FormulaModification(ModificationBase): @@ -236,7 +335,8 @@ def resolve(self): composition = Composition(formula=''.join(self.value.split(" "))) return { "mass": composition.mass(), - "composition": composition + "composition": composition, + "name": self.value } @@ -246,26 +346,11 @@ class GlycanModification(ModificationBase): _tag_type = TagTypeEnum.glycan -class GenericModification(TagBase): - __slots__ = () - - def __init__(self, value, extra=None, group_id=None): - super(GenericModification, self).__init__( - TagTypeEnum.generic, value, extra, group_id) - - def _format_main(self): - return self.value - - def resolve(self): - '''Find the term, searching through all available vocabularies and - return the first match's properties - ''' - raise NotImplementedError() - - class UnimodModification(ModificationBase): __slots__ = () + resolver = UnimodResolver() + prefix_name = "UNIMOD" short_prefix = "U" _tag_type = TagTypeEnum.unimod @@ -295,6 +380,33 @@ class XLMODModification(ModificationBase): _tag_type = TagTypeEnum.xlmod +class GenericModification(ModificationBase): + __slots__ = () + _tag_type = TagTypeEnum.generic + + def __init__(self, value, extra=None, group_id=None): + super(GenericModification, self).__init__( + value, extra, group_id) + + def _format_main(self): + return self.value + + def resolve(self): + '''Find the term, searching through all available vocabularies and + return the first match's properties + ''' + keys = self._parse_identifier() + defn = None + try: + defn = UnimodModification.resolver(*keys) + except KeyError: + pass + if defn is not None: + return defn + raise KeyError(keys) + + + def split_tags(tokens): '''Split a token array into discrete sets of tag tokens. 
@@ -524,9 +636,9 @@ def __repr__(self): return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self) -class TagParser(object): - '''A parser which accumulates tokens until it is asked to parse them into - :class:`TagBase` instances. +class TokenBuffer(object): + '''A token buffer that wraps the accumulation and reset logic + of a list of :class:`str` objects. Implements a subset of the Sequence protocol. @@ -534,19 +646,9 @@ class TagParser(object): ---------- buffer: list The list of tokens accumulated since the last parsing. - group_ids: set - The set of all group IDs that have been produced so far. ''' - - def __init__(self, initial=None, group_ids=None): - if initial: - self.buffer = list(initial) - else: - self.buffer = [] - if group_ids: - self.group_ids = set(group_ids) - else: - self.group_ids = set() + def __init__(self, initial=None): + self.buffer = list(initial or []) def append(self, c): '''Append a new character to the buffer. @@ -575,6 +677,53 @@ def __getitem__(self, i): def __len__(self): return len(self.buffer) + def process(self): + value = self.buffer + self.reset() + return value + + def __call__(self): + return self.process() + + +class NumberParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`int` instances. + + Implements a subset of the Sequence protocol. + + Attributes + ---------- + buffer: list + The list of tokens accumulated since the last parsing. + ''' + def process(self): + value = int(''.join(self)) + self.reset() + return value + + +class TagParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`TagBase` instances. + + Implements a subset of the Sequence protocol. + + Attributes + ---------- + buffer: list + The list of tokens accumulated since the last parsing. + group_ids: set + The set of all group IDs that have been produced so far. 
+ ''' + + def __init__(self, initial=None, group_ids=None): + super(TagParser, self).__init__(initial) + if group_ids: + self.group_ids = set(group_ids) + else: + self.group_ids = set() + def process(self): '''Parse the content of the internal buffer, clear the buffer, and return the parsed tag. @@ -601,7 +750,10 @@ class ParserStateEnum(Enum): interval_tag = 7 tag_after_sequence = 8 stable_isotope = 9 - + post_tag_before = 10 + unlocalized_count = 11 + post_global = 12 + post_global_aa = 13 done = 999 @@ -615,6 +767,10 @@ class ParserStateEnum(Enum): TAG = ParserStateEnum.tag_in_sequence INTERVAL_TAG = ParserStateEnum.interval_tag TAG_AFTER = ParserStateEnum.tag_after_sequence +POST_TAG_BEFORE = ParserStateEnum.post_tag_before +UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count +POST_GLOBAL = ParserStateEnum.post_global +POST_GLOBAL_AA = ParserStateEnum.post_global_aa DONE = ParserStateEnum.done VALID_AA = set("QWERTYIPASDFGHKLCVNM") @@ -659,6 +815,8 @@ def parse_proforma(sequence): current_aa = None current_tag = TagParser() current_interval = None + current_unlocalized_count = NumberParser() + current_aa_targets = TokenBuffer() while i < n: c = sequence[i] @@ -676,7 +834,8 @@ def parse_proforma(sequence): current_aa = c state = SEQ else: - raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == SEQ: if c in VALID_AA: positions.append((current_aa, current_tag.process() if current_tag else None)) @@ -688,22 +847,22 @@ def parse_proforma(sequence): current_interval = TaggedInterval(len(positions) + 1) elif c == ')': if current_interval is None: - raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) else: current_interval.end = len(positions) + 
1 if i >= n or sequence[i] != '[': - raise Exception("Missing Interval Tag") + raise ProFormaError("Missing Interval Tag", i, state) i += 1 depth = 1 state = INTERVAL_TAG elif c == '-': state = TAG_AFTER if i >= n or sequence[i] != '[': - raise Exception("Missing Interval Tag") + raise ProFormaError("Missing Interval Tag", i, state) i += 1 depth = 1 else: - raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL: if c == '[': depth += 1 @@ -714,62 +873,26 @@ def parse_proforma(sequence): if state == TAG: state = SEQ elif state == TAG_BEFORE: - if i < n: - cnext = sequence[i] - if cnext == '?': - unlocalized_modifications.append(current_tag.process()) - i += 1 - elif cnext == '-': - n_term = current_tag.process() - i += 1 - else: - i += 1 - raise Exception("Error In State {state}, unexpected {cnext} found at index {i}".format(**locals())) - - state = BEFORE + state = POST_TAG_BEFORE elif state == TAG_AFTER: c_term = current_tag.process() state = DONE elif state == GLOBAL: - # Gobble the rest of the global tag inline to avoid spawning - # a whole new state. 
- if i < n: - c = sequence[i] - i += 1 - if c != '@': - raise Exception( - ("Error In State {state}, fixed modification detected without " - "target amino acids found at index {i}").format(**locals())) - end = 0 - targets = [] - while i < n: - c = sequence[i] - i += 1 - if c in VALID_AA: - targets.append(c) - elif c == ',': - pass - elif '>': - break - else: - raise Exception( - ("Error In State {state}, unclosed fixed modification rule").format(**locals())) - - fixed_modifications.append( - ModificationRule(current_tag.process(), targets)) - state = BEFORE + state = POST_GLOBAL else: current_tag.append(c) elif state == FIXED: if c == '[': state = GLOBAL else: + # Do validation here state = ISOTOPE current_tag.append(c) elif state == ISOTOPE: if c != '>': current_tag.append(c) else: + # Not technically a tag, but exploits the current buffer isotopes.append(StableIsotope(''.join(current_tag))) current_tag.reset() state = BEFORE @@ -797,10 +920,58 @@ def parse_proforma(sequence): state = SEQ else: current_tag.append(c) + elif state == POST_TAG_BEFORE: + if c == '?': + unlocalized_modifications.append(current_tag.process()) + state = BEFORE + elif c == '-': + n_term = current_tag.process() + state = BEFORE + elif c == '^': + state = UNLOCALIZED_COUNT + else: + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + elif state == UNLOCALIZED_COUNT: + if c.isdigit(): + current_unlocalized_count.append(c) + elif c == '[': + state = TAG_BEFORE + depth = 1 + tag = current_tag.process() + multiplicity = current_unlocalized_count.process() + for i in range(multiplicity): + unlocalized_modifications.append(tag) + elif c == '?': + state = BEFORE + tag = current_tag.process() + multiplicity = current_unlocalized_count.process() + for i in range(multiplicity): + unlocalized_modifications.append(tag) + else: + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) 
+ elif state == POST_GLOBAL: + if c == '@': + state = POST_GLOBAL_AA + else: + raise ProFormaError( + ("Error In State {state}, fixed modification detected without " + "target amino acids found at index {i}").format(**locals()), i, state) + elif state == POST_GLOBAL_AA: + if c in VALID_AA: + current_aa_targets.append(c) + elif c == '>': + fixed_modifications.append( + ModificationRule(current_tag.process(), current_aa_targets.process())) + state = BEFORE + else: + raise ProFormaError( + ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state) else: - raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals())) + raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): - raise Exception("Error In State {state}, unclosed group reached end of string!".format(**locals())) + raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) if current_aa: positions.append((current_aa, current_tag.process() if current_tag else None)) return positions, { @@ -811,5 +982,5 @@ def parse_proforma(sequence): 'fixed_modifications': fixed_modifications, 'intervals': intervals, 'isotopes': isotopes, - 'group_ids': list(current_tag.group_ids) + 'group_ids': sorted(current_tag.group_ids) } From 0b79ccc761fe52123abf6fae9a0d0bdecb084ef4 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Thu, 7 Jan 2021 08:51:32 -0500 Subject: [PATCH 05/27] Clean up markers --- pyteomics/proforma.py | 302 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 281 insertions(+), 21 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index c98d29a9..6b4b39c1 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -27,7 +27,8 @@ ''' import re -from collections import namedtuple, defaultdict +from collections import namedtuple, defaultdict, deque 
+from functools import partial try: from enum import Enum @@ -38,10 +39,22 @@ from six import add_metaclass from pyteomics import parser -from pyteomics.mass import Composition -from pyteomics.auxiliary import PyteomicsError +from pyteomics.mass import Composition, std_aa_mass +from pyteomics.auxiliary import PyteomicsError, BasicComposition from pyteomics.mass import Unimod +# To eventually be implemented with pyteomics port? +try: + from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache) +except ImportError: + def _needs_psims(name): + raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims" % name) + + load_psimod = partial(_needs_psims, 'PSIMOD') + load_xlmod = partial(_needs_psims, 'XLMOD') + load_gno = partial(_needs_psims, 'GNO') + obo_cache = None + class ProFormaError(PyteomicsError): def __init__(self, message, index=None, parser_state=None, **kwargs): @@ -155,7 +168,20 @@ def find_extra(self, label): return out -class PositionLabelTag(TagBase): +class GroupLabelBase(TagBase): + __slots__ = () + + def __str__(self): + part = self._format_main() + if self.extra: + rest = [str(e) for e in self.extra] + label = '|'.join([part] + rest) + else: + label = part + return '%s' % label + + +class PositionLabelTag(GroupLabelBase): '''A tag to mark that a position is involved in a group in some way, but does not imply any specific semantics. 
''' @@ -167,10 +193,10 @@ def __init__(self, value=None, extra=None, group_id=None): TagTypeEnum.position_label, group_id, extra, group_id) def _format_main(self): - return "#{self.group_id}".format(self=self) + return "{self.group_id}".format(self=self) -class LocalizationMarker(TagBase): +class LocalizationMarker(GroupLabelBase): '''A tag to mark a particular localization site ''' __slots__ = () @@ -181,7 +207,7 @@ def __init__(self, value, extra=None, group_id=None): TagTypeEnum.localization_marker, float(value), extra, group_id) def _format_main(self): - return "#{self.group_id}({self.value!f})".format(self=self) + return "{self.group_id}({self.value:.4g})".format(self=self) class InformationTag(TagBase): @@ -211,12 +237,29 @@ def __init__(self, value, extra=None, group_id=None): TagTypeEnum.massmod, float(value), extra, group_id) def _format_main(self): - return '%0.4f' % self.value + if self.value >= 0: + return ('+%0.4g' % self.value).rstrip('0').rstrip('.') + else: + return ('%0.4g' % self.value).rstrip('0').rstrip('.') + + @property + def mass(self): + return self.value class ModificationResolver(object): def __init__(self, name, *args, **kwargs): self.name = name + self._database = None + + def load_database(self): + raise NotImplementedError() + + @property + def database(self): + if not self._database: + self._database = self.load_database() + return self._database def resolve(self, name=None, id=None, **kwargs): raise NotImplementedError() @@ -228,13 +271,10 @@ def __call__(self, name=None, id=None, **kwargs): class UnimodResolver(ModificationResolver): def __init__(self, *args, **kwargs): super(UnimodResolver, self).__init__("unimod", *args, **kwargs) - self._database = kwargs.get("database" ) + self._database = kwargs.get("database") - @property - def database(self): - if not self._database: - self._database = Unimod() - return self._database + def load_database(self): + return Unimod() def resolve(self, name=None, id=None, **kwargs): if name is not 
None: @@ -258,6 +298,71 @@ def resolve(self, name=None, id=None, **kwargs): } +class PSIModResolver(ModificationResolver): + def __init__(self, *args, **kwargs): + super(PSIModResolver, self).__init__('psimod', *args, **kwargs) + self._database = kwargs.get("database") + + def load_database(self): + return load_psimod() + + def resolve(self, name=None, id=None, **kwargs): + if name is None: + defn = self.database[name] + elif id is None: + defn = self.database['MOD:{:05d}'.format(id)] + else: + raise ValueError("Must provide one of `name` or `id`") + mass = float(defn.DiffMono.strip()[1:-1]) + composition = Composition(defn.DiffFormula.strip()[1:-1].replace(" ", '')) + return { + 'mass': mass, + 'composition': composition, + 'name': defn.name, + 'id': defn.id, + 'provider': self.name + } + + +class XLMODResolver(ModificationResolver): + def __init__(self, *args, **kwargs): + super(XLMODResolver, self).__init__('xlmod', *args, **kwargs) + self._database = kwargs.get("database") + + def load_database(self): + return load_psimod() + + def resolve(self, name=None, id=None, **kwargs): + if name is None: + defn = self.database[name] + elif id is None: + defn = self.database['XLMOD:{:05d}'.format(id)] + else: + raise ValueError("Must provide one of `name` or `id`") + mass = float(defn['monoIsotopicMass']) + if 'deadEndFormula' in defn: + composition = Composition(defn['deadEndFormula'].replace(" ", '').replace("D", "H[2]")) + elif 'bridgeFormula' in defn: + composition = Composition( + defn['bridgeFormula'].replace(" ", '').replace("D", "H[2]")) + return { + 'mass': mass, + 'composition': composition, + 'name': defn.name, + 'id': defn.id, + 'provider': self.name + } + + +class GNOResolver(ModificationResolver): + def __init__(self, *args, **kwargs): + super(GNOResolver, self).__init__('gnome', *args, **kwargs) + self._database = kwargs.get("database") + + def load_database(self): + return load_gno() + + class ModificationBase(TagBase): '''A base class for all modification 
tags with marked prefixes. ''' @@ -331,7 +436,6 @@ class FormulaModification(ModificationBase): def resolve(self): # The handling of fixed isotopes is wrong here as Pyteomics uses a different # convention. - from pyteomics.mass import Composition composition = Composition(formula=''.join(self.value.split(" "))) return { "mass": composition.mass(), @@ -345,6 +449,44 @@ class GlycanModification(ModificationBase): _tag_type = TagTypeEnum.glycan + valid_monosaccharides = { + "Hex": (162.0528, Composition("C6H10O5")), + "HexNAc": (203.0793, Composition("C6H13N1O5")), + "HexS": (242.009, Composition("C8H10O8S1")), + "HexP": (242.0191, Composition("C6H11O8P1")), + "HexNAcS": (283.0361, Composition("C8H13N1O8S1")), + "dHex": (146.0579, Composition("C6H10O4")), + "NeuAc": (291.0954, Composition("C11H17N1O8")), + "NeuGc": (307.0903, Composition("C11H17N1O9")), + "Pen": (132.0422, Composition("C5H8O4")), + "Fuc": (146.0579, Composition("C6H10O4")) + } + + tokenizer = re.compile(r"([A-Za-z]+)\s*(\d*)\s*") + + def resolve(self): + composite = BasicComposition() + for tok, cnt in self.tokenizer.findall(self.value): + if cnt: + cnt = int(cnt) + else: + cnt = 1 + if tok not in self.valid_monosaccharides: + raise ValueError(f"{tok!r} is not a valid monosaccharide name") + composite[tok] += cnt + mass = 0 + chemcomp = Composition() + for key, cnt in composite.items(): + m, c = self.valid_monosaccharides[key] + mass += m * cnt + chemcomp += c * cnt + return { + "mass": mass, + "composition": chemcomp, + "name": self.value, + "monosaccharides": composite + } + class UnimodModification(ModificationBase): __slots__ = () @@ -359,6 +501,8 @@ class UnimodModification(ModificationBase): class PSIModModification(ModificationBase): __slots__ = () + resolver = PSIModResolver() + prefix_name = "MOD" short_prefix = 'M' _tag_type = TagTypeEnum.psimod @@ -375,6 +519,8 @@ class GNOmeModification(ModificationBase): class XLMODModification(ModificationBase): __slots__ = () + resolver = 
XLMODResolver() + prefix_name = "XLMOD" # short_prefix = 'XL' _tag_type = TagTypeEnum.xlmod @@ -406,7 +552,6 @@ def resolve(self): raise KeyError(keys) - def split_tags(tokens): '''Split a token array into discrete sets of tag tokens. @@ -434,7 +579,15 @@ def split_tags(tokens): out = [] for i, start in enumerate(starts): end = ends[i] - out.append(tokens[start:end]) + tag = tokens[start:end] + if len(tag) == 0: + continue + # Short circuit on INFO tags which can't be broken + # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']): + # tag = tokens[start:] + # out.append(tag) + # break + out.append(tag) return out @@ -566,7 +719,7 @@ def __ne__(self, other): def __str__(self): targets = ','.join(self.targets) - return "<{self.modification_tag}@{targets}>".format(self=self, targets=targets) + return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets) def __repr__(self): return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self) @@ -790,7 +943,7 @@ def parse_proforma(sequence): Returns ------- - parsed_sequence: list + parsed_sequence: list[tuple[str, TagBase]] The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence modifiers: dict A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized @@ -970,10 +1123,10 @@ def parse_proforma(sequence): ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state) else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) - if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): - raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) if current_aa: positions.append((current_aa, current_tag.process() if current_tag else None)) + if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, 
LABILE, ): + raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) return positions, { 'n_term': n_term, 'c_term': c_term, @@ -984,3 +1137,110 @@ def parse_proforma(sequence): 'isotopes': isotopes, 'group_ids': sorted(current_tag.group_ids) } + + +def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None, + labile_modifications=None, fixed_modifications=None, intervals=None, + isotopes=None, group_ids=None): + '''Convert a sequence plus modifiers into formatted text following the + ProForma specification. + + Parameters + ---------- + sequence : list[tuple[str, TagBase]] + The primary sequence of the peptidoform/proteoform to render + n_term : Optional[TagBase] + The N-terminal modification, if any. + c_term : Optional[TagBase] + The C-terminal modification, if any. + unlocalized_modifications : Optional[list[TagBase]] + Any modifications which aren't assigned to a specific location. + labile_modifications : Optional[list[TagBase]] + Any labile modifications + fixed_modifications : Optional[list[ModificationRule]] + Any fixed modifications + intervals : Optional[list[TaggedInterval]] + A list of modified intervals, if any + isotopes : Optional[list[StableIsotope]] + Any global stable isotope labels applied + group_ids : Optional[list[str]] + Any group identifiers. This parameter is currently not used. 
+ + Returns + ------- + str + ''' + primary = deque(['{0!s}[{1!s}]'.format(*p) if p[1] else p[0] for p in sequence]) + if intervals: + for iv in sorted(intervals, key=lambda x: x.start): + primary[iv.start] = '(' + primary[iv.start] + primary[iv.end - 1] = '{0!s})[{1!s}]'.format(primary[iv.end - 1], iv.tag) + if n_term: + primary.appendleft("[{!s}]-".format(n_term)) + if c_term: + primary.append('-[{!s}]'.format(c_term)) + if labile_modifications: + primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications]) + if unlocalized_modifications: + primary.appendleft("?") + primary.extendleft(['[{!s}]'.format(m) for m in unlocalized_modifications]) + if isotopes: + primary.extendleft(['{!s}'.format(m) for m in isotopes]) + if fixed_modifications: + primary.extendleft(['{!s}'.format(m) for m in fixed_modifications]) + return ''.join(primary) + + +class ProForma(object): + def __init__(self, sequence, properties): + self.sequence = sequence + self.properties = properties + + def __str__(self): + return to_proforma(self.sequence, **self.properties) + + def __repr__(self): + return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self) + + def __getitem__(self, i): + if isinstance(i, slice): + return self.__class__(self.sequence[i], self.properties) + else: + return self.sequence[i] + + @classmethod + def parse(cls, string): + return cls(*parse_proforma(string)) + + @property + def mass(self): + mass = 0.0 + + fixed_modifications = self.properties['fixed_modifications'] + fixed_rules = {} + for rule in fixed_modifications: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.mass + + for position in self.sequence: + aa = position[0] + mass += std_aa_mass[aa] + if aa in fixed_rules: + mass += fixed_rules[aa] + tag = position[1] + if tag: + try: + mass += tag.mass + except (AttributeError, KeyError): + continue + for mod in self.properties['labile_modifications']: + mass += mod.mass + for mod in 
self.properties['unlocalized_modifications']: + mass += mod.mass + for iv in self.properties['intervals']: + try: + mass += iv.tag.mass + except (AttributeError, KeyError): + continue + return mass + From d069380e077099f4869d8664eb411c076449d122 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Wed, 13 Jan 2021 20:24:03 -0500 Subject: [PATCH 06/27] More proforma parsing experiments --- proforma_parsing.ipynb | 906 ++++++----------------------------------- 1 file changed, 121 insertions(+), 785 deletions(-) diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb index 6ca08254..dfca8348 100644 --- a/proforma_parsing.ipynb +++ b/proforma_parsing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 172, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,813 +22,105 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "class PrefixSavingMeta(type):\n", - " '''A subclass-registering-metaclass that provides easy\n", - " lookup of subclasses by prefix attributes.\n", - " '''\n", + "import importlib\n", + "from pyteomics import proforma\n", + "importlib.reload(proforma)\n", "\n", - " def __new__(mcs, name, parents, attrs):\n", - " new_type = type.__new__(mcs, name, parents, attrs)\n", - " prefix = attrs.get(\"prefix_name\")\n", - " if prefix:\n", - " new_type.prefix_map[prefix.lower()] = new_type\n", - " short = attrs.get(\"short_prefix\")\n", - " if short:\n", - " new_type.prefix_map[short.lower()] = new_type\n", - " return new_type\n", - " \n", - " def find_by_tag(self, tag_name):\n", - " if tag_name is None:\n", - " raise ValueError(\"tag_name cannot be None!\")\n", - " tag_name = tag_name.lower()\n", - " return self.prefix_map[tag_name]" + "tokenize_proforma = proforma.parse_proforma\n", + "format_proforma = proforma.to_proforma" ] }, { "cell_type": "code", - "execution_count": 227, + "execution_count": 41, "metadata": {}, "outputs": [], "source": 
[ - "class TagTypeEnum(Enum):\n", - " unimod = 0\n", - " psimod = 1\n", - " massmod = 2\n", - " generic = 3\n", - " info = 4\n", - " gnome = 5\n", - " xlmod = 6\n", - "\n", - " formula = 7\n", - " glycan = 8\n", - "\n", - " localization_marker = 9\n", - " position_label = 10\n", - " group_placeholder = 999\n", - "\n", - "\n", - "_sentinel = object()\n", - "\n", - "\n", - "@add_metaclass(PrefixSavingMeta)\n", - "class TagBase(object):\n", - " '''A base class for all tag types.\n", - "\n", - " Attributes\n", - " ----------\n", - " type: Enum\n", - " An element of :class:`TagTypeEnum` saying what kind of tag this is.\n", - " value: object\n", - " The data stored in this tag, usually an externally controlled name\n", - " extra: list\n", - " Any extra tags that were nested within this tag. Usually limited to INFO\n", - " tags but may be other synonymous controlled vocabulary terms.\n", - " group_id: str or None\n", - " A short label denoting which group, if any, this tag belongs to\n", - " '''\n", - " __slots__ = (\"type\", \"value\", \"extra\", \"group_id\")\n", - "\n", - " prefix_name = None\n", - " short_prefix = None\n", - " prefix_map = {}\n", - " \n", - " def __init__(self, type, value, extra=None, group_id=None):\n", - " self.type = type\n", - " self.value = value\n", - " self.extra = extra\n", - " self.group_id = group_id\n", - "\n", - " def __str__(self):\n", - " part = self._format_main()\n", - " if self.extra:\n", - " rest = [str(e) for e in self.extra]\n", - " label = '|'.join([part] + rest)\n", - " else:\n", - " label = part\n", - " if self.group_id:\n", - " label = '%s%s' % (label, self.group_id)\n", - " return '%s' % label\n", - " \n", - " def __repr__(self):\n", - " template = \"{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})\"\n", - " return template.format(self=self)\n", - " \n", - " def __eq__(self, other):\n", - " if other is None:\n", - " return False\n", - " return (self.type == other.type) and (self.value == 
other.value) and (self.extra == other.extra) \\\n", - " and (self.group_id == other.group_id)\n", - "\n", - " def __ne__(self, other):\n", - " return not self == other\n", - "\n", - " def find_extra(self, label):\n", - " out = []\n", - " if not self.extra:\n", - " return out\n", - " for e in self.extra:\n", - " if e.type == label:\n", - " out.append(e)\n", - " return out\n", - "\n", - "\n", - "class PositionLabelTag(TagBase):\n", - " '''A tag to mark that a position is involved in a group in some way, but does\n", - " not imply any specific semantics.\n", - " '''\n", - " __slots__ = ()\n", - "\n", - " def __init__(self, value=None, extra=None, group_id=None):\n", - " assert group_id is not None\n", - " super(PositionLabelTag, self).__init__(TagTypeEnum.position_label, group_id, extra, group_id)\n", - " \n", - " def _format_main(self):\n", - " return \"#{self.group_id}\".format(self=self)\n", - "\n", - "\n", - "class LocalizationMarker(TagBase):\n", - " '''A tag to mark a particular localization site \n", - " '''\n", - " __slots__ = ()\n", - " \n", - " def __init__(self, value, extra=None, group_id=None):\n", - " assert group_id is not None\n", - " super(LocalizationMarker, self).__init__(TagTypeEnum.localization_marker, float(value), extra, group_id)\n", - " \n", - " def _format_main(self):\n", - " return \"#{self.group_id}({self.value!f})\".format(self=self)\n", - "\n", - "\n", - "class InformationTag(TagBase):\n", - " '''A tag carrying free text describing the location\n", - " '''\n", - " __slots__ = ()\n", - "\n", - " prefix_name = \"INFO\"\n", - "\n", - " def __init__(self, value, extra=None, group_id=None):\n", - " super(InformationTag, self).__init__(TagTypeEnum.info, str(value), extra, group_id)\n", - "\n", - " def _format_main(self):\n", - " return str(self.value)\n", - "\n", - "\n", - "class MassModification(TagBase):\n", - " '''A modification defined purely by a signed mass shift in Daltons.\n", - "\n", - " The value of a :class:`MassModification` is 
always a :class:`float`\n", - " '''\n", - " __slots__ = ()\n", - " \n", - " def __init__(self, value, extra=None, group_id=None):\n", - " super(MassModification, self).__init__(TagTypeEnum.massmod, float(value), extra, group_id)\n", - " \n", - " def _format_main(self):\n", - " return '%0.4f' % self.value\n", - "\n", - "\n", - " \n", - "class ModificationBase(TagBase):\n", - " '''A base class for all modification tags with marked prefixes.\n", - " '''\n", - "\n", - " _tag_type = None\n", - " __slots__ = ()\n", - " \n", - " def __init__(self, value, extra=None, group_id=None):\n", - " super(ModificationBase, self).__init__(\n", - " self._tag_type, value, extra, group_id)\n", - "\n", - " def _format_main(self):\n", - " return \"{self.prefix_name}:{self.value}\".format(self=self)\n", - " \n", - " def resolve(self):\n", - " '''Find the term and return it's properties\n", - " '''\n", - " raise NotImplementedError()\n", - "\n", - "\n", - "class FormulaModification(ModificationBase):\n", - " prefix_name = \"Formula\"\n", - "\n", - " _tag_type = TagTypeEnum.formula\n", - "\n", - " def resolve(self):\n", - " # The handling of fixed isotopes is wrong here as Pyteomics uses a different\n", - " # convention.\n", - " from pyteomics.mass import Composition\n", - " composition = Composition(formula=''.join(self.value.split(\" \")))\n", - " return {\n", - " \"mass\": composition.mass(),\n", - " \"composition\": composition\n", - " }\n", - "\n", - "\n", - "class GlycanModification(ModificationBase):\n", - " prefix_name = \"Glycan\"\n", - "\n", - " _tag_type = TagTypeEnum.glycan\n", - "\n", - " \n", - "class GenericModification(TagBase):\n", - " __slots__ = ()\n", - " \n", - " def __init__(self, value, extra=None, group_id=None):\n", - " super(GenericModification, self).__init__(TagTypeEnum.generic, value, extra, group_id)\n", - " \n", - " def _format_main(self):\n", - " return self.value\n", - "\n", - " def resolve(self):\n", - " '''Find the term, searching through all available 
vocabularies and\n", - " return the first match's properties\n", - " '''\n", - " raise NotImplementedError()\n", - "\n", - "\n", - "class UnimodModification(ModificationBase):\n", - " __slots__ = ()\n", - " \n", - " prefix_name = \"UNIMOD\"\n", - " short_prefix = \"U\"\n", - " _tag_type = TagTypeEnum.unimod\n", - "\n", - "\n", - "class PSIModModification(ModificationBase):\n", - " __slots__ = ()\n", - " \n", - " prefix_name = \"MOD\"\n", - " short_prefix = 'M'\n", - " _tag_type = TagTypeEnum.psimod\n", - "\n", - "\n", - "class GNOmeModification(ModificationBase):\n", - " __slots__ = ()\n", - " \n", - " prefix_name = \"GNO\"\n", - " # short_prefix = 'G'\n", - " _tag_type = TagTypeEnum.gnome\n", - "\n", - " \n", - "class XLMODModification(ModificationBase):\n", - " __slots__ = ()\n", - " \n", - " prefix_name = \"XLMOD\"\n", - " # short_prefix = 'XL'\n", - " _tag_type = TagTypeEnum.xlmod\n", - "\n", - "\n", - "def split_tags(tokens):\n", - " '''Split a token array into discrete sets of tag\n", - " tokens.\n", - "\n", - " Parameters\n", - " ----------\n", - " tokens: list\n", - " The characters of the tag token buffer\n", - " \n", - " Returns\n", - " -------\n", - " list of list:\n", - " The tokens for each contained tag\n", - " '''\n", - " starts = [0]\n", - " ends = []\n", - " for i, c in enumerate(tokens):\n", - " if c == '|':\n", - " ends.append(i)\n", - " starts.append(i + 1)\n", - " elif (i != 0 and c == '#'):\n", - " ends.append(i)\n", - " starts.append(i)\n", - " ends.append(len(tokens))\n", - " out = []\n", - " for i, start in enumerate(starts):\n", - " end = ends[i]\n", - " out.append(tokens[start:end])\n", - " return out\n", - "\n", - "\n", - "def find_prefix(tokens):\n", - " '''Find the prefix, if any of the tag defined by `tokens`\n", - " delimited by \":\".\n", - "\n", - " Parameters\n", - " ----------\n", - " tokens: list\n", - " The tag tokens to search\n", - " \n", - " Returns\n", - " -------\n", - " prefix: str or None\n", - " The prefix string, if 
found\n", - " rest: str\n", - " The rest of the tokens, merged as a string\n", - " '''\n", - " for i, c in enumerate(tokens):\n", - " if c == ':':\n", - " return ''.join(tokens[:i]), ''.join(tokens[i + 1:])\n", - " return None, tokens\n", - "\n", - "def process_marker(tokens):\n", - " '''Process a marker, which is a tag whose value starts with #.\n", - "\n", - " Parameters\n", - " ----------\n", - " tokens: list\n", - " The tag tokens to parse\n", - "\n", - " Returns\n", - " -------\n", - " PositionLabelTag or LocalizationMarker\n", - " '''\n", - " if tokens[1:3] == 'XL':\n", - " return PositionLabelTag(None, group_id=''.join(tokens))\n", - " else:\n", - " group_id = None\n", - " value = None\n", - " for i, c in enumerate(tokens):\n", - " if c == '(':\n", - " group_id = ''.join(tokens[:i])\n", - " if tokens[-1] != ')':\n", - " raise Exception(\"Localization marker with score missing closing parenthesis\")\n", - " value = float(''.join(tokens[i + 1:-1]))\n", - " return LocalizationMarker(value, group_id=group_id)\n", - " else:\n", - " group_id = ''.join(tokens)\n", - " return PositionLabelTag(group_id=group_id)\n", - " \n", - "\n", - "\n", - "def process_tag_tokens(tokens):\n", - " '''Convert a tag token buffer into a parsed :class:`TagBase` instance\n", - " of the appropriate sub-type with zero or more sub-tags.\n", - "\n", - " Parameters\n", - " ----------\n", - " tokens: list\n", - " The tokens to parse\n", - " \n", - " Returns\n", - " -------\n", - " TagBase:\n", - " The parsed tag\n", - " '''\n", - " parts = split_tags(tokens)\n", - " main_tag = parts[0]\n", - " if main_tag[0] in ('+', '-'):\n", - " main_tag = ''.join(main_tag)\n", - " main_tag = MassModification(main_tag)\n", - " elif main_tag[0] == '#':\n", - " main_tag = process_marker(main_tag)\n", - " else:\n", - " prefix, value = find_prefix(main_tag)\n", - " if prefix is None:\n", - " main_tag = GenericModification(''.join(value))\n", - " else:\n", - " tag_type = TagBase.find_by_tag(prefix)\n", - " 
main_tag = tag_type(value)\n", - " if len(parts) > 1:\n", - " extras = []\n", - " for part in parts[1:]:\n", - " prefix, value = find_prefix(part)\n", - " if prefix is None:\n", - " if value[0] == \"#\":\n", - " marker = process_marker(value)\n", - " if isinstance(marker, PositionLabelTag):\n", - " main_tag.group_id = ''.join(value)\n", - " else:\n", - " main_tag.group_id = marker.group_id\n", - " extras.append(marker)\n", - " else:\n", - " extras.append(GenericModification(''.join(value)))\n", - " else:\n", - " tag_type = TagBase.find_by_tag(prefix)\n", - " extras.append(tag_type(value))\n", - " main_tag.extra = extras\n", - " return main_tag" + "seq, props = proforma.parse_proforma(\"{Glycan:Hex 1 HexNAc 2 NeuAc 1}STYGIAN\")" ] }, { "cell_type": "code", - "execution_count": 228, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "class ModificationRule(object):\n", - " '''Define a fixed modification rule which dictates a modification tag is\n", - " always applied at one or more amino acid residues.\n", - "\n", - " Attributes\n", - " ----------\n", - " modification_tag: TagBase\n", - " The modification to apply\n", - " targets: list\n", - " The list of amino acids this applies to\n", - " '''\n", - " __slots__ = ('modification_tag', 'targets')\n", - "\n", - " def __init__(self, modification_tag, targets=None):\n", - " self.modification_tag = modification_tag\n", - " self.targets = targets\n", - "\n", - " def __eq__(self, other):\n", - " if other is None:\n", - " return False\n", - " return self.modification_tag == other.modification_tag and self.targets == other.targets\n", - "\n", - " def __ne__(self, other):\n", - " return not self == other\n", - "\n", - " def __str__(self):\n", - " targets = ','.join(self.targets)\n", - " return \"<{self.modification_tag}@{targets}>\".format(self=self, targets=targets)\n", - "\n", - " def __repr__(self):\n", - " return \"{self.__class__.__name__}({self.modification_tag!r}, 
{self.targets})\".format(self=self)\n", - "\n", - "\n", - "class StableIsotope(object):\n", - " '''Define a fixed isotope that is applied globally to all amino acids.\n", - "\n", - " Attributes\n", - " ----------\n", - " isotope: str\n", - " The stable isotope string, of the form [] or a special\n", - " isotopoform's name.\n", - " '''\n", - " __slots__ = ('isotope', )\n", - "\n", - " def __init__(self, isotope):\n", - " self.isotope = isotope\n", - "\n", - " def __eq__(self, other):\n", - " if other is None:\n", - " return False\n", - " return self.isotope == other.isotope\n", - "\n", - " def __ne__(self, other):\n", - " return not self == other\n", - "\n", - " def __str__(self):\n", - " return \"<{self.isotope}>\".format(self=self)\n", - "\n", - " def __repr__(self):\n", - " return \"{self.__class__.__name__}({self.isotope})\".format(self=self)\n", - "\n", - "\n", - "class TaggedInterval(object):\n", - " '''Define a fixed interval over the associated sequence which contains the localization\n", - " of the associated tag.\n", - "\n", - " Attributes\n", - " ----------\n", - " start: int\n", - " The starting position (inclusive) of the interval along the primary sequence\n", - " end: int\n", - " The ending position (exclusive) of the interval along the primary sequence\n", - " tag: TagBase\n", - " The tag being localized\n", - " '''\n", - " __slots__ = ('start', 'end', 'tag')\n", - "\n", - " def __init__(self, start, end=None, tag=None):\n", - " self.start = start\n", - " self.end = end\n", - " self.tag = tag\n", - " \n", - " def __eq__(self, other):\n", - " if other is None:\n", - " return False\n", - " return self.start == other.start and self.end == other.end and self.tag == other.tag\n", - "\n", - " def __ne__(self, other):\n", - " return not self == other\n", - "\n", - " def __str__(self):\n", - " return \"({self.start}-{self.end}){self.tag!r}\".format(self=self)\n", - "\n", - " def __repr__(self):\n", - " return \"{self.__class__.__name__}({self.start}, 
{self.end}, {self.tag})\".format(self=self)\n" + "p = proforma.ProForma.parse(\"{Glycan:Hex1HexNAc2NeuAc1#g1}S[#g1]T[#g1]YGIANS[#g1]EQ\")" ] }, { "cell_type": "code", - "execution_count": 229, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'n_term': None,\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [],\n", + " 'labile_modifications': [GlycanModification('Hex1HexNAc2NeuAc1', None, None)],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [],\n", + " 'isotopes': [],\n", + " 'group_ids': []}" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ], "source": [ - "class TagParser(object):\n", - " '''A parser which accumulates tokens until it is asked to parse them into\n", - " :class:`TagBase` instances.\n", - "\n", - " Implements a subset of the Sequence protocol.\n", - "\n", - " Attributes\n", - " ----------\n", - " buffer: list\n", - " The list of tokens accumulated since the last parsing.\n", - " group_ids: set\n", - " The set of all group IDs that have been produced so far.\n", - " '''\n", - "\n", - " def __init__(self, initial=None, group_ids=None):\n", - " if initial:\n", - " self.buffer = list(initial)\n", - " else:\n", - " self.buffer = []\n", - " if group_ids:\n", - " self.group_ids = set(group_ids)\n", - " else:\n", - " self.group_ids = set()\n", - " \n", - " def append(self, c):\n", - " '''Append a new character to the buffer.\n", - "\n", - " Parameters\n", - " ----------\n", - " c: str\n", - " The character appended\n", - " '''\n", - " self.buffer.append(c)\n", - " \n", - " def reset(self):\n", - " '''Discard the content of the current buffer.\n", - " '''\n", - " self.buffer = []\n", - " \n", - " def __bool__(self):\n", - " return bool(self.buffer)\n", - " \n", - " def __iter__(self):\n", - " return iter(self.buffer)\n", - "\n", - " def __getitem__(self, i):\n", - " return self.buffer[i]\n", - " \n", - " def __len__(self):\n", - " 
return len(self.buffer)\n", - "\n", - " def process(self):\n", - " '''Parse the content of the internal buffer, clear the buffer,\n", - " and return the parsed tag.\n", - "\n", - " Returns\n", - " -------\n", - " TagBase\n", - " '''\n", - " tag = process_tag_tokens(self.buffer)\n", - " if tag.group_id:\n", - " self.group_ids.add(tag.group_id)\n", - " self.reset()\n", - " return tag" + "p.properties" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], "source": [] }, { "cell_type": "code", - "execution_count": 230, + "execution_count": 12, "metadata": {}, - "outputs": [], - "source": [ - "class ParserStateEnum(Enum):\n", - " before_sequence = 0\n", - " tag_before_sequence = 1\n", - " global_tag = 2\n", - " fixed_spec = 3\n", - " labile_tag = 4\n", - " sequence = 5\n", - " tag_in_sequence = 6\n", - " interval_tag = 7\n", - " tag_after_sequence = 8\n", - " stable_isotope = 9\n", - "\n", - " done = 999\n", - "\n", - "\n", - "BEFORE = ParserStateEnum.before_sequence\n", - "TAG_BEFORE = ParserStateEnum.tag_before_sequence\n", - "FIXED = ParserStateEnum.fixed_spec\n", - "GLOBAL = ParserStateEnum.global_tag\n", - "ISOTOPE = ParserStateEnum.stable_isotope\n", - "LABILE = ParserStateEnum.labile_tag\n", - "SEQ = ParserStateEnum.sequence\n", - "TAG = ParserStateEnum.tag_in_sequence\n", - "INTERVAL_TAG = ParserStateEnum.interval_tag\n", - "TAG_AFTER = ParserStateEnum.tag_after_sequence\n", - "DONE = ParserStateEnum.done\n", - "\n", - "VALID_AA = set(\"QWERTYIPASDFGHKLCVNM\")\n", - "\n", - "def parse_proforma(sequence):\n", - " '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a\n", - " mapping of sequence-spanning modifiers.\n", - "\n", - " .. 
note::\n", - " This is a state machine parser, but with certain sub-state paths\n", - " unrolled to avoid an explosion of formal intermediary states.\n", - "\n", - " Parameters\n", - " ----------\n", - " sequence: str\n", - " The sequence to parse\n", - " \n", - " Returns\n", - " -------\n", - " parsed_sequence: list\n", - " The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence\n", - " modifiers: dict\n", - " A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized\n", - " modifications, tagged intervals, and group IDs\n", - " '''\n", - " labile_modifications = []\n", - " fixed_modifications = []\n", - " unlocalized_modifications = []\n", - " intervals = []\n", - " isotopes = []\n", - " \n", - " n_term = None\n", - " c_term = None\n", - " \n", - " i = 0\n", - " n = len(sequence)\n", - " \n", - " positions = []\n", - " state = BEFORE\n", - " depth = 0\n", - " \n", - " current_aa = None\n", - " current_tag = TagParser()\n", - " current_interval = None\n", - " \n", - " while i < n:\n", - " c = sequence[i]\n", - " i += 1\n", - " if state == BEFORE:\n", - " if c == '[':\n", - " state = TAG_BEFORE\n", - " depth = 1\n", - " elif c == '{':\n", - " state = LABILE\n", - " depth = 1\n", - " elif c == '<':\n", - " state = FIXED\n", - " elif c in VALID_AA:\n", - " current_aa = c\n", - " state = SEQ\n", - " else:\n", - " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " elif state == SEQ:\n", - " if c in VALID_AA:\n", - " positions.append((current_aa, current_tag.process() if current_tag else None))\n", - " current_aa = c\n", - " elif c == '[':\n", - " state = TAG\n", - " depth = 1\n", - " elif c == '(':\n", - " current_interval = TaggedInterval(len(positions) + 1)\n", - " elif c == ')':\n", - " if current_interval is None:\n", - " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " else:\n", - 
" current_interval.end = len(positions) + 1\n", - " if i >= n or sequence[i] != '[':\n", - " raise Exception(\"Missing Interval Tag\")\n", - " i += 1\n", - " depth = 1\n", - " state = INTERVAL_TAG\n", - " elif c == '-':\n", - " state = TAG_AFTER\n", - " if i >= n or sequence[i] != '[':\n", - " raise Exception(\"Missing Interval Tag\")\n", - " i += 1\n", - " depth = 1 \n", - " else:\n", - " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL:\n", - " if c == '[':\n", - " depth += 1\n", - " elif c == ']':\n", - " depth -= 1\n", - " if depth <= 0:\n", - " depth = 0\n", - " if state == TAG: \n", - " state = SEQ\n", - " elif state == TAG_BEFORE:\n", - " if i < n:\n", - " cnext = sequence[i]\n", - " if cnext == '?':\n", - " unlocalized_modifications.append(current_tag.process())\n", - " i += 1\n", - " elif cnext == '-':\n", - " n_term = current_tag.process()\n", - " i += 1\n", - " else:\n", - " i += 1\n", - " raise Exception(\"Error In State {state}, unexpected {cnext} found at index {i}\".format(**locals()))\n", - "\n", - " state = BEFORE\n", - " elif state == TAG_AFTER:\n", - " c_term = current_tag.process()\n", - " state = DONE\n", - " elif state == GLOBAL:\n", - " # Gobble the rest of the global tag inline to avoid spawning\n", - " # a whole new state.\n", - " if i < n:\n", - " c = sequence[i]\n", - " i += 1\n", - " if c != '@':\n", - " raise Exception(\n", - " (\"Error In State {state}, fixed modification detected without \"\n", - " \"target amino acids found at index {i}\").format(**locals()))\n", - " end = 0\n", - " targets = []\n", - " while i < n:\n", - " c = sequence[i]\n", - " i += 1\n", - " if c in VALID_AA:\n", - " targets.append(c)\n", - " elif c == ',':\n", - " pass\n", - " elif '>':\n", - " break\n", - " else:\n", - " raise Exception(\n", - " (\"Error In State {state}, unclosed fixed modification 
rule\").format(**locals()))\n", - "\n", - " fixed_modifications.append(\n", - " ModificationRule(current_tag.process(), targets))\n", - " state = BEFORE\n", - " else:\n", - " current_tag.append(c)\n", - " elif state == FIXED:\n", - " if c == '[':\n", - " state = GLOBAL\n", - " else:\n", - " state = ISOTOPE\n", - " current_tag.append(c)\n", - " elif state == ISOTOPE:\n", - " if c != '>':\n", - " current_tag.append(c)\n", - " else:\n", - " isotopes.append(StableIsotope(''.join(current_tag)))\n", - " current_tag.reset()\n", - " state = BEFORE\n", - " elif state == LABILE:\n", - " if c == '{':\n", - " depth += 1\n", - " elif c == '}':\n", - " depth -= 1\n", - " if depth <= 0:\n", - " depth = 0\n", - " labile_modifications.append(current_tag.process())\n", - " state = BEFORE\n", - " else:\n", - " current_tag.append(c)\n", - " elif state == INTERVAL_TAG:\n", - " if c == '[':\n", - " depth += 1\n", - " elif c == ']':\n", - " depth -= 1\n", - " if depth <= 0:\n", - " depth = 0\n", - " current_interval.tag = current_tag.process()\n", - " intervals.append(current_interval)\n", - " current_interval = None\n", - " state = SEQ\n", - " else:\n", - " current_tag.append(c)\n", - " else:\n", - " raise Exception(\"Error In State {state}, unexpected {c} found at index {i}\".format(**locals()))\n", - " if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):\n", - " raise Exception(\"Error In State {state}, unclosed group reached end of string!\".format(**locals()))\n", - " if current_aa:\n", - " positions.append((current_aa, current_tag.process() if current_tag else None))\n", - " return positions, {\n", - " 'n_term': n_term,\n", - " 'c_term': c_term,\n", - " 'unlocalized_modifications': unlocalized_modifications,\n", - " 'labile_modifications': labile_modifications,\n", - " 'fixed_modifications': fixed_modifications,\n", - " 'intervals': intervals,\n", - " 'isotopes': isotopes,\n", - " 'group_ids': list(current_tag.group_ids)\n", - " }" - ] + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + "{'mass': 841.2962353162999,\n", + " 'composition': Composition({'C': 29, 'H': 51, 'O': 22, 'N': 3}),\n", + " 'name': 'Hex 1 HexNAc 2 NeuAc 1',\n", + " 'monosaccharides': BasicComposition({'Hex': 1, 'HexNAc': 2, 'NeuAc': 1})}" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [] }, { "cell_type": "code", - "execution_count": 231, + "execution_count": 120, "metadata": {}, "outputs": [ { @@ -836,7 +128,7 @@ "data": { "text/plain": [ "([('S', None),\n", - " ('T', UnimodModification('Ox', None, None)),\n", + " ('T', UnimodModification('Oxidation', None, None)),\n", " ('E', None),\n", " ('P', None),\n", " ('P', None),\n", @@ -845,26 +137,66 @@ " ('G', None)],\n", " {'n_term': GenericModification('Hex', None, None),\n", " 'c_term': None,\n", - " 'unlocalized_modifications': [GenericModification('Bar', None, None)],\n", - " 'labile_modifications': [GenericModification('Foo', None, None)],\n", + " 'unlocalized_modifications': [GenericModification('Hydroxyl', None, None)],\n", + " 'labile_modifications': [GenericModification('HexNAc', None, None)],\n", " 'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n", - " 'intervals': [TaggedInterval(2, 5, 18.0000)],\n", + " 'intervals': [TaggedInterval(2, 5, +18.15)],\n", " 'isotopes': [StableIsotope(13C)],\n", " 'group_ids': []})" ] }, "metadata": {}, - "execution_count": 231 + "execution_count": 120 } ], "source": [ - "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Bar]?{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING\")\n", + "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[U:Oxidation](EPP)[+18.15]ING\")\n", "seq, fields" ] }, { "cell_type": "code", - "execution_count": 232, + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + 
"'<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING'" + ] + }, + "metadata": {}, + "execution_count": 121 + } + ], + "source": [ + "format_proforma(seq, **fields)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'1010100.00001'" + ] + }, + "metadata": {}, + "execution_count": 119 + } + ], + "source": [ + "'1010100.00001'.rstrip('0').rstrip('.')" + ] + }, + { + "cell_type": "code", + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -889,7 +221,7 @@ ] }, "metadata": {}, - "execution_count": 232 + "execution_count": 67 } ], "source": [ @@ -899,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": 69, "metadata": {}, "outputs": [ { @@ -928,11 +260,11 @@ ] }, "metadata": {}, - "execution_count": 233 + "execution_count": 69 } ], "source": [ - "parse_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")" + "tokenize_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")" ] }, { @@ -1023,9 +355,13 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.8.5 64-bit ('py38': conda)", + "metadata": { + "interpreter": { + "hash": "486495e7f81c8f11fe15f00929ebabe524f3fb730012655e6ba0fbdcd165e71e" + } + } }, "language_info": { "codemirror_mode": { From ed2cacf1949096ccdf1af4ffcd087e5f16997972 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Tue, 19 Jan 2021 08:46:54 -0500 Subject: [PATCH 07/27] Fix out-of-order monosaccharide formulae;Add support for the Obs tag;Flesh out the Generic modification resolver; --- pyteomics/proforma.py | 94 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 20 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 6b4b39c1..e15b1a29 100644 --- 
a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -36,12 +36,11 @@ # Python 2 doesn't have a builtin Enum type Enum = object -from six import add_metaclass from pyteomics import parser -from pyteomics.mass import Composition, std_aa_mass +from pyteomics.mass import Composition, std_aa_mass, Unimod from pyteomics.auxiliary import PyteomicsError, BasicComposition -from pyteomics.mass import Unimod +from pyteomics.auxiliary.utils import add_metaclass # To eventually be implemented with pyteomics port? try: @@ -56,6 +55,10 @@ def _needs_psims(name): obo_cache = None +std_aa_mass = std_aa_mass.copy() +std_aa_mass['X'] = 0 + + class ProFormaError(PyteomicsError): def __init__(self, message, index=None, parser_state=None, **kwargs): super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state) @@ -142,7 +145,7 @@ def __str__(self): else: label = part if self.group_id: - label = '%s|%s' % (label, self.group_id) + label = '%s%s' % (label, self.group_id) return '%s' % label def __repr__(self): @@ -167,6 +170,10 @@ def find_extra(self, label): out.append(e) return out + @classmethod + def parse(cls, buffer): + return process_tag_tokens(buffer) + class GroupLabelBase(TagBase): __slots__ = () @@ -189,8 +196,9 @@ class PositionLabelTag(GroupLabelBase): def __init__(self, value=None, extra=None, group_id=None): assert group_id is not None + value = group_id super(PositionLabelTag, self).__init__( - TagTypeEnum.position_label, group_id, extra, group_id) + TagTypeEnum.position_label, value, extra, group_id) def _format_main(self): return "{self.group_id}".format(self=self) @@ -230,17 +238,24 @@ class MassModification(TagBase): The value of a :class:`MassModification` is always a :class:`float` ''' - __slots__ = () + __slots__ = ('_significant_figures', ) + + prefix_name = "Obs" def __init__(self, value, extra=None, group_id=None): + if isinstance(value, str): + sigfigs = len(value.split('.')[-1].rstrip('0')) + else: + sigfigs = 4 + self._significant_figures 
= sigfigs super(MassModification, self).__init__( TagTypeEnum.massmod, float(value), extra, group_id) def _format_main(self): if self.value >= 0: - return ('+%0.4g' % self.value).rstrip('0').rstrip('.') + return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') else: - return ('%0.4g' % self.value).rstrip('0').rstrip('.') + return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') @property def mass(self): @@ -248,7 +263,7 @@ def mass(self): class ModificationResolver(object): - def __init__(self, name, *args, **kwargs): + def __init__(self, name, **kwargs): self.name = name self._database = None @@ -269,8 +284,8 @@ def __call__(self, name=None, id=None, **kwargs): class UnimodResolver(ModificationResolver): - def __init__(self, *args, **kwargs): - super(UnimodResolver, self).__init__("unimod", *args, **kwargs) + def __init__(self, **kwargs): + super(UnimodResolver, self).__init__("unimod", **kwargs) self._database = kwargs.get("database") def load_database(self): @@ -299,8 +314,8 @@ def resolve(self, name=None, id=None, **kwargs): class PSIModResolver(ModificationResolver): - def __init__(self, *args, **kwargs): - super(PSIModResolver, self).__init__('psimod', *args, **kwargs) + def __init__(self, **kwargs): + super(PSIModResolver, self).__init__('psimod', **kwargs) self._database = kwargs.get("database") def load_database(self): @@ -325,8 +340,8 @@ def resolve(self, name=None, id=None, **kwargs): class XLMODResolver(ModificationResolver): - def __init__(self, *args, **kwargs): - super(XLMODResolver, self).__init__('xlmod', *args, **kwargs) + def __init__(self, **kwargs): + super(XLMODResolver, self).__init__('xlmod', **kwargs) self._database = kwargs.get("database") def load_database(self): @@ -353,16 +368,43 @@ def resolve(self, name=None, id=None, **kwargs): 'provider': self.name } - +# TODO: Implement resolve walking up the graph to get the mass. 
Can't really +# get any more information without glypy/glyspace interaction class GNOResolver(ModificationResolver): - def __init__(self, *args, **kwargs): - super(GNOResolver, self).__init__('gnome', *args, **kwargs) + def __init__(self, **kwargs): + super(GNOResolver, self).__init__('gnome', **kwargs) self._database = kwargs.get("database") def load_database(self): return load_gno() +class GenericResolver(ModificationResolver): + + def __init__(self, resolvers, **kwargs): + super(GenericResolver, self).__init__('generic', **kwargs) + self.resolvers = list(resolvers) + + def load_database(self): + return None + + def resolve(self, name=None, id=None, **kwargs): + defn = None + for resolver in self.resolvers: + try: + defn = resolver(name=name, id=id, **kwargs) + except (KeyError): + continue + if defn is None: + if name is None: + raise KeyError(id) + elif id is None: + raise KeyError(name) + else: + raise ValueError("Must provide one of `name` or `id`") + return defn + + class ModificationBase(TagBase): '''A base class for all modification tags with marked prefixes. 
''' @@ -451,8 +493,8 @@ class GlycanModification(ModificationBase): valid_monosaccharides = { "Hex": (162.0528, Composition("C6H10O5")), - "HexNAc": (203.0793, Composition("C6H13N1O5")), - "HexS": (242.009, Composition("C8H10O8S1")), + "HexNAc": (203.0793, Composition("C8H13N1O5")), + "HexS": (242.009, Composition("C6H10O8S1")), "HexP": (242.0191, Composition("C6H11O8P1")), "HexNAcS": (283.0361, Composition("C8H13N1O8S1")), "dHex": (146.0579, Composition("C6H10O4")), @@ -464,6 +506,10 @@ class GlycanModification(ModificationBase): tokenizer = re.compile(r"([A-Za-z]+)\s*(\d*)\s*") + @property + def monosaccharides(self): + return self.definition.get('monosaccharides') + def resolve(self): composite = BasicComposition() for tok, cnt in self.tokenizer.findall(self.value): @@ -511,6 +557,8 @@ class PSIModModification(ModificationBase): class GNOmeModification(ModificationBase): __slots__ = () + resolver = GNOResolver() + prefix_name = "GNO" # short_prefix = 'G' _tag_type = TagTypeEnum.gnome @@ -529,6 +577,12 @@ class XLMODModification(ModificationBase): class GenericModification(ModificationBase): __slots__ = () _tag_type = TagTypeEnum.generic + resolver = GenericResolver([ + UnimodModification.resolver, + PSIModModification.resolver, + XLMODModification.resolver, + GNOmeModification.resolver, + ]) def __init__(self, value, extra=None, group_id=None): super(GenericModification, self).__init__( From 527b8209f4fb5dcb6a50120a7c8d8e05472943ad Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Tue, 23 Mar 2021 13:48:12 -0400 Subject: [PATCH 08/27] Add multimod example --- proforma_parsing.ipynb | 44 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb index dfca8348..f457d854 100644 --- a/proforma_parsing.ipynb +++ b/proforma_parsing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": 
[ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -34,6 +34,46 @@ "format_proforma = proforma.to_proforma" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "([('P', None),\n", + " ('A', None),\n", + " ('R', None),\n", + " ('S', GenericModification('PhosphoPhospho', None, None)),\n", + " ('E', None),\n", + " ('R', None)],\n", + " {'n_term': None,\n", + " 'c_term': None,\n", + " 'unlocalized_modifications': [],\n", + " 'labile_modifications': [],\n", + " 'fixed_modifications': [],\n", + " 'intervals': [],\n", + " 'isotopes': [],\n", + " 'group_ids': []})" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "proforma.parse_proforma(\"PARS[Phospho][Phospho]ER\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 41, From 4afde59668e43c9f30253e3a1ee3e25e4496ac11 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 28 Mar 2021 22:48:57 -0400 Subject: [PATCH 09/27] Prepping for draft PR --- proforma_parsing.ipynb | 421 ----------------------------------------- pyteomics/proforma.py | 24 ++- 2 files changed, 22 insertions(+), 423 deletions(-) delete mode 100644 proforma_parsing.ipynb diff --git a/proforma_parsing.ipynb b/proforma_parsing.ipynb deleted file mode 100644 index f457d854..00000000 --- a/proforma_parsing.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "from collections import namedtuple, defaultdict\n", - "\n", - "try:\n", - " from enum import Enum\n", - "except ImportError:\n", - " # Python 2 doesn't have a builtin Enum type\n", - " Enum = object\n", - "\n", - "from six import add_metaclass\n", - "\n", - 
"from pyteomics import parser" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import importlib\n", - "from pyteomics import proforma\n", - "importlib.reload(proforma)\n", - "\n", - "tokenize_proforma = proforma.parse_proforma\n", - "format_proforma = proforma.to_proforma" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('P', None),\n", - " ('A', None),\n", - " ('R', None),\n", - " ('S', GenericModification('PhosphoPhospho', None, None)),\n", - " ('E', None),\n", - " ('R', None)],\n", - " {'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [],\n", - " 'fixed_modifications': [],\n", - " 'intervals': [],\n", - " 'isotopes': [],\n", - " 'group_ids': []})" - ] - }, - "metadata": {}, - "execution_count": 6 - } - ], - "source": [ - "proforma.parse_proforma(\"PARS[Phospho][Phospho]ER\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "seq, props = proforma.parse_proforma(\"{Glycan:Hex 1 HexNAc 2 NeuAc 1}STYGIAN\")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "p = proforma.ProForma.parse(\"{Glycan:Hex1HexNAc2NeuAc1#g1}S[#g1]T[#g1]YGIANS[#g1]EQ\")" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [GlycanModification('Hex1HexNAc2NeuAc1', None, None)],\n", - " 'fixed_modifications': [],\n", - " 'intervals': [],\n", - " 'isotopes': [],\n", - " 'group_ids': []}" - ] - }, - 
"metadata": {}, - "execution_count": 48 - } - ], - "source": [ - "p.properties" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "1" - ] - }, - "metadata": {}, - "execution_count": 25 - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'mass': 841.2962353162999,\n", - " 'composition': Composition({'C': 29, 'H': 51, 'O': 22, 'N': 3}),\n", - " 'name': 'Hex 1 HexNAc 2 NeuAc 1',\n", - " 'monosaccharides': BasicComposition({'Hex': 1, 'HexNAc': 2, 'NeuAc': 1})}" - ] - }, - "metadata": {}, - "execution_count": 12 - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('S', None),\n", - " ('T', UnimodModification('Oxidation', None, None)),\n", - " ('E', None),\n", - " ('P', None),\n", - " ('P', None),\n", - " ('I', None),\n", - " ('N', None),\n", - " ('G', None)],\n", - " {'n_term': GenericModification('Hex', None, None),\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [GenericModification('Hydroxyl', None, None)],\n", - " 'labile_modifications': [GenericModification('HexNAc', None, None)],\n", - " 'fixed_modifications': [ModificationRule(GenericModification('Carbamidomethyl', None, None), ['C'])],\n", - " 'intervals': [TaggedInterval(2, 5, +18.15)],\n", - " 'isotopes': [StableIsotope(13C)],\n", - " 'group_ids': []})" - ] - }, - "metadata": {}, - "execution_count": 120 - } - ], - "source": [ - "seq, fields = tokenize_proforma(\"<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[U:Oxidation](EPP)[+18.15]ING\")\n", - "seq, fields" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - 
"text/plain": [ - "'<[Carbamidomethyl]@C><13C>[Hydroxyl]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING'" - ] - }, - "metadata": {}, - "execution_count": 121 - } - ], - "source": [ - "format_proforma(seq, **fields)" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'1010100.00001'" - ] - }, - "metadata": {}, - "execution_count": 119 - } - ], - "source": [ - "'1010100.00001'.rstrip('0').rstrip('.')" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('S', None),\n", - " ('E', None),\n", - " ('P', None),\n", - " ('P', None),\n", - " ('I', None),\n", - " ('N', None),\n", - " ('G', None)],\n", - " {'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [],\n", - " 'fixed_modifications': [],\n", - " 'intervals': [TaggedInterval(1, 4, 18.0000)],\n", - " 'isotopes': [],\n", - " 'group_ids': []})" - ] - }, - "metadata": {}, - "execution_count": 67 - } - ], - "source": [ - "seq, fields = tokenize_proforma(\"S(EPP)[+18]ING\")\n", - "seq, fields" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('E', None),\n", - " ('M', GenericModification('Oxidation', None, None)),\n", - " ('E', None),\n", - " ('V', None),\n", - " ('T', LocalizationMarker(0.01, None, '#s1')),\n", - " ('S', LocalizationMarker(0.09, None, '#s1')),\n", - " ('E', None),\n", - " ('S', LocalizationMarker(0.9, None, '#s1')),\n", - " ('P', None),\n", - " ('E', None),\n", - " ('K', None)],\n", - " {'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [GenericModification('Phospho', [], '#s1')],\n", - " 'labile_modifications': [],\n", - " 'fixed_modifications': [],\n", - " 'intervals': 
[],\n", - " 'isotopes': [],\n", - " 'group_ids': ['#s1']})" - ] - }, - "metadata": {}, - "execution_count": 69 - } - ], - "source": [ - "tokenize_proforma(\"[Phospho#s1]?EM[Oxidation]EVT[#s1(0.01)]S[#s1(0.09)]ES[#s1(0.90)]PEK\")" - ] - }, - { - "cell_type": "code", - "execution_count": 234, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('E', None),\n", - " ('M', GenericModification('Oxidation', None, None)),\n", - " ('E', None),\n", - " ('V', None),\n", - " ('T', LocalizationMarker(0.01, None, '#g1')),\n", - " ('S', LocalizationMarker(0.09, None, '#g1')),\n", - " ('E', None),\n", - " ('S',\n", - " GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n", - " ('P', None),\n", - " ('E', None),\n", - " ('K', None)],\n", - " {'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [],\n", - " 'fixed_modifications': [],\n", - " 'intervals': [],\n", - " 'isotopes': [],\n", - " 'group_ids': ['#g1']})" - ] - }, - "metadata": {}, - "execution_count": 234 - } - ], - "source": [ - "tokenize_proforma(\"EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK\")" - ] - }, - { - "cell_type": "code", - "execution_count": 235, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "([('E', None),\n", - " ('M', None),\n", - " ('E', None),\n", - " ('V', None),\n", - " ('T', LocalizationMarker(0.01, None, '#g1')),\n", - " ('S', LocalizationMarker(0.09, None, '#g1')),\n", - " ('E', None),\n", - " ('S',\n", - " GlycanModification('HexNAc 1', [LocalizationMarker(0.9, None, '#g1')], '#g1')),\n", - " ('P', None),\n", - " ('E', None),\n", - " ('K', None)],\n", - " {'n_term': None,\n", - " 'c_term': None,\n", - " 'unlocalized_modifications': [],\n", - " 'labile_modifications': [],\n", - " 'fixed_modifications': [],\n", - " 'intervals': [],\n", - " 'isotopes': [],\n", - " 'group_ids': ['#g1']})" 
- ] - }, - "metadata": {}, - "execution_count": 235 - } - ], - "source": [ - "tokenize_proforma(\"EMEVT[#g1(0.01)]S[#g1(0.09)]ES[Glycan:HexNAc 1#g1(0.90)]PEK\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit ('py38': conda)", - "metadata": { - "interpreter": { - "hash": "486495e7f81c8f11fe15f00929ebabe524f3fb730012655e6ba0fbdcd165e71e" - } - } - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5-final" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index e15b1a29..c0fe755d 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1,6 +1,18 @@ -'''A simple ProForma lexer +''' +proforma - Proteoform and Peptidoform Notation +============================================== + +ProForma is a notation for defining modified amino acid sequences using +a set of controlled vocabularies, as well as encoding uncertain or partial +information about localization. See `ProForma specification <https://www.psidev.info/proforma>`_ +for more up-to-date information. + +Strictly speaking, this implementation supports ProForma v2. + +Data Access +----------- -The primary interface is through :func:`parse_proforma`: +:py:func:`parse_proforma` - The primary interface for parsing ProForma strings. >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK") ([('E', None), @@ -24,6 +36,14 @@ 'isotopes': [], 'group_ids': ['#g1']}) +:py:func:`to_proforma` - Format a sequence and set of properties as ProForma text.
+ + +Classes +------- + +:py:class:`ProForma` - An object oriented version of the parsing and formatting code, +coupled with minimal information about mass and position data. ''' import re From 34d36dbece0a870400241f20b7e16aa73a40dc5f Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 23 May 2021 23:09:56 -0400 Subject: [PATCH 10/27] Add support for multiple tags per position, add tests, and fix some lingering issues. --- pyteomics/proforma.py | 281 +++++++++++++++++++++++++++++++++-------- tests/test_proforma.py | 65 ++++++++++ 2 files changed, 294 insertions(+), 52 deletions(-) create mode 100644 tests/test_proforma.py diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index c0fe755d..49df3ed0 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -47,6 +47,7 @@ ''' import re +import warnings from collections import namedtuple, defaultdict, deque from functools import partial @@ -175,18 +176,35 @@ def __repr__(self): def __eq__(self, other): if other is None: return False + if isinstance(other, str): + return str(self) == other return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \ and (self.group_id == other.group_id) def __ne__(self, other): return not self == other - def find_extra(self, label): + def find_tag_type(self, tag_type): + '''Search this tag or tag collection for elements with a particular + tag type and return them. + + Parameters + ---------- + tag_type : TagTypeEnum + A label from :class:`TagTypeEnum`, or an equivalent type. + + Returns + ------- + matches : list + The list of all tags in this object which match the requested tag type. 
+ ''' out = [] + if self.type == tag_type: + out.append(self) if not self.extra: return out for e in self.extra: - if e.type == label: + if e.type == tag_type: out.append(e) return out @@ -307,15 +325,27 @@ class UnimodResolver(ModificationResolver): def __init__(self, **kwargs): super(UnimodResolver, self).__init__("unimod", **kwargs) self._database = kwargs.get("database") + self.strict = kwargs.get("strict", True) def load_database(self): return Unimod() def resolve(self, name=None, id=None, **kwargs): + strict = kwargs.get("strict", self.strict) + exhaustive = kwargs.get("exhaustive", True) if name is not None: - defn = self.database.by_title(name) + defn = self.database.by_title(name, strict=strict) if not defn: - defn = self.database.by_name(name) + defn = self.database.by_name(name, strict=strict) + if not defn and exhaustive and strict: + defn = self.database.by_title(name, strict=False) + if not defn: + defn = self.database.by_name(name, strict=False) + if defn and isinstance(defn, list): + warnings.warn( + "Multiple matches found for {!r} in Unimod, taking the first, {}.".format( + name, defn[0]['record_id'])) + defn = defn[0] if not defn: raise KeyError(name) elif id is not None: @@ -342,9 +372,9 @@ def load_database(self): return load_psimod() def resolve(self, name=None, id=None, **kwargs): - if name is None: + if name is not None: defn = self.database[name] - elif id is None: + elif id is not None: defn = self.database['MOD:{:05d}'.format(id)] else: raise ValueError("Must provide one of `name` or `id`") @@ -368,9 +398,9 @@ def load_database(self): return load_psimod() def resolve(self, name=None, id=None, **kwargs): - if name is None: + if name is not None: defn = self.database[name] - elif id is None: + elif id is not None: defn = self.database['XLMOD:{:05d}'.format(id)] else: raise ValueError("Must provide one of `name` or `id`") @@ -391,6 +421,8 @@ def resolve(self, name=None, id=None, **kwargs): # TODO: Implement resolve walking up the graph to 
get the mass. Can't really # get any more information without glypy/glyspace interaction class GNOResolver(ModificationResolver): + mass_pattern = re.compile(r"(\d+(:?\.\d+)) Da") + def __init__(self, **kwargs): super(GNOResolver, self).__init__('gnome', **kwargs) self._database = kwargs.get("database") @@ -398,6 +430,38 @@ def __init__(self, **kwargs): def load_database(self): return load_gno() + def get_mass_from_term(self, term): + root_id = 'GNO:00000001' + parent = term.parent() + if isinstance(parent, list): + parent = parent[0] + while parent.id != root_id: + next_parent = term.parent() + if isinstance(next_parent, list): + next_parent = next_parent[0] + if next_parent.id == root_id: + break + parent = next_parent + match = self.mass_pattern.search(parent.name) + if not match: + return None + return float(match.group(1)) + + def resolve(self, name=None, id=None, **kwargs): + if name is not None: + term = self.database[name] + elif id is not None: + term = self.database[id] + else: + raise ValueError("Must provide one of `name` or `id`") + rec = { + "name":term.name, + "id": term.id, + "provider": self.name, + "composition": None, + "mass": self.get_mass_from_term(term) + } + class GenericResolver(ModificationResolver): @@ -598,10 +662,15 @@ class GenericModification(ModificationBase): __slots__ = () _tag_type = TagTypeEnum.generic resolver = GenericResolver([ - UnimodModification.resolver, + # Do exact matching here first. Then default to non-strict matching as a final + # correction effort. + partial(UnimodModification.resolver, exhaustive=False), PSIModModification.resolver, XLMODModification.resolver, GNOmeModification.resolver, + # Some really common names aren't actually found in the XML exactly, so default + # to non-strict matching now to avoid masking other sources here. 
+ partial(UnimodModification.resolver, strict=False) ]) def __init__(self, value, extra=None, group_id=None): @@ -862,6 +931,9 @@ def __str__(self): def __repr__(self): return "{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self) + def as_slice(self): + return slice(self.start, self.end) + class TokenBuffer(object): '''A token buffer that wraps the accumulation and reset logic @@ -876,6 +948,7 @@ class TokenBuffer(object): ''' def __init__(self, initial=None): self.buffer = list(initial or []) + self.boundaries = [] def append(self, c): '''Append a new character to the buffer. @@ -890,7 +963,10 @@ def append(self, c): def reset(self): '''Discard the content of the current buffer. ''' - self.buffer = [] + if self.buffer: + self.buffer = [] + if self.boundaries: + self.boundaries = [] def __bool__(self): return bool(self.buffer) @@ -904,11 +980,31 @@ def __getitem__(self, i): def __len__(self): return len(self.buffer) + def tokenize(self): + i = 0 + pieces = [] + for k in self.boundaries + [len(self)]: + piece = self.buffer[i:k] + i = k + pieces.append(piece) + return pieces + + def _transform(self, value): + return value + def process(self): - value = self.buffer + if self.boundaries: + value = [self._transform(v) for v in self.tokenize()] + else: + value = self._transform(self.buffer) self.reset() return value + def bound(self): + k = len(self) + self.boundaries.append(k) + return k + def __call__(self): return self.process() @@ -924,10 +1020,9 @@ class NumberParser(TokenBuffer): buffer: list The list of tokens accumulated since the last parsing. ''' - def process(self): - value = int(''.join(self)) - self.reset() - return value + + def _transform(self, value): + return int(''.join(value)) class TagParser(TokenBuffer): @@ -951,20 +1046,18 @@ def __init__(self, initial=None, group_ids=None): else: self.group_ids = set() - def process(self): - '''Parse the content of the internal buffer, clear the buffer, - and return the parsed tag. 
- - Returns - ------- - TagBase - ''' - tag = process_tag_tokens(self.buffer) + def _transform(self, value): + tag = process_tag_tokens(value) if tag.group_id: self.group_ids.add(tag.group_id) - self.reset() return tag + def process(self): + value = super().process() + if not isinstance(value, list): + value = [value] + return value + class ParserStateEnum(Enum): before_sequence = 0 @@ -981,6 +1074,7 @@ class ParserStateEnum(Enum): unlocalized_count = 11 post_global = 12 post_global_aa = 13 + post_interval_tag = 14 done = 999 @@ -998,6 +1092,7 @@ class ParserStateEnum(Enum): UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count POST_GLOBAL = ParserStateEnum.post_global POST_GLOBAL_AA = ParserStateEnum.post_global_aa +POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag DONE = ParserStateEnum.done VALID_AA = set("QWERTYIPASDFGHKLCVNM") @@ -1065,18 +1160,28 @@ def parse_proforma(sequence): "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) elif state == SEQ: if c in VALID_AA: - positions.append((current_aa, current_tag.process() if current_tag else None)) + positions.append((current_aa, current_tag() if current_tag else None)) current_aa = c elif c == '[': state = TAG + if current_tag: + current_tag.bound() depth = 1 elif c == '(': + if current_interval is not None: + raise ProFormaError( + ("Error In State {state}, nested range found at index {i}. 
" + "Nested ranges are not yet supported by ProForma.").format( + **locals()), i, state) current_interval = TaggedInterval(len(positions) + 1) elif c == ')': + positions.append( + (current_aa, current_tag() if current_tag else None)) + current_aa = None if current_interval is None: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) else: - current_interval.end = len(positions) + 1 + current_interval.end = len(positions) if i >= n or sequence[i] != '[': raise ProFormaError("Missing Interval Tag", i, state) i += 1 @@ -1085,7 +1190,7 @@ def parse_proforma(sequence): elif c == '-': state = TAG_AFTER if i >= n or sequence[i] != '[': - raise ProFormaError("Missing Interval Tag", i, state) + raise ProFormaError("Missing Closing Tag", i, state) i += 1 depth = 1 else: @@ -1102,7 +1207,7 @@ def parse_proforma(sequence): elif state == TAG_BEFORE: state = POST_TAG_BEFORE elif state == TAG_AFTER: - c_term = current_tag.process() + c_term = current_tag() state = DONE elif state == GLOBAL: state = POST_GLOBAL @@ -1130,7 +1235,7 @@ def parse_proforma(sequence): depth -= 1 if depth <= 0: depth = 0 - labile_modifications.append(current_tag.process()) + labile_modifications.append(current_tag()[0]) state = BEFORE else: current_tag.append(c) @@ -1140,19 +1245,32 @@ def parse_proforma(sequence): elif c == ']': depth -= 1 if depth <= 0: + state = POST_INTERVAL_TAG depth = 0 - current_interval.tag = current_tag.process() - intervals.append(current_interval) - current_interval = None - state = SEQ else: current_tag.append(c) + elif state == POST_INTERVAL_TAG: + if c == '[': + current_tag.bound() + state = INTERVAL_TAG + elif c in VALID_AA: + current_aa = c + current_interval.tag = current_tag() + intervals.append(current_interval) + current_interval = None + state = SEQ + elif c == '-': + state = TAG_AFTER + if i >= n or sequence[i] != '[': + raise ProFormaError("Missing Closing Tag", i, state) + i += 1 + depth = 1 elif state == 
POST_TAG_BEFORE: if c == '?': - unlocalized_modifications.append(current_tag.process()) + unlocalized_modifications.append(current_tag()[0]) state = BEFORE elif c == '-': - n_term = current_tag.process() + n_term = current_tag() state = BEFORE elif c == '^': state = UNLOCALIZED_COUNT @@ -1165,14 +1283,14 @@ def parse_proforma(sequence): elif c == '[': state = TAG_BEFORE depth = 1 - tag = current_tag.process() - multiplicity = current_unlocalized_count.process() + tag = current_tag()[0] + multiplicity = current_unlocalized_count() for i in range(multiplicity): unlocalized_modifications.append(tag) elif c == '?': state = BEFORE - tag = current_tag.process() - multiplicity = current_unlocalized_count.process() + tag = current_tag()[0] + multiplicity = current_unlocalized_count() for i in range(multiplicity): unlocalized_modifications.append(tag) else: @@ -1190,7 +1308,7 @@ def parse_proforma(sequence): current_aa_targets.append(c) elif c == '>': fixed_modifications.append( - ModificationRule(current_tag.process(), current_aa_targets.process())) + ModificationRule(current_tag()[0], current_aa_targets())) state = BEFORE else: raise ProFormaError( @@ -1198,7 +1316,7 @@ def parse_proforma(sequence): else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) if current_aa: - positions.append((current_aa, current_tag.process() if current_tag else None)) + positions.append((current_aa, current_tag() if current_tag else None)) if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) return positions, { @@ -1244,15 +1362,22 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No ------- str ''' - primary = deque(['{0!s}[{1!s}]'.format(*p) if p[1] else p[0] for p in sequence]) + primary = deque() + for aa, tags in sequence: + if not tags: + primary.append(str(aa)) + else: + 
primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags])) if intervals: for iv in sorted(intervals, key=lambda x: x.start): primary[iv.start] = '(' + primary[iv.start] - primary[iv.end - 1] = '{0!s})[{1!s}]'.format(primary[iv.end - 1], iv.tag) + + primary[iv.end - 1] = '{0!s})'.format( + primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tag) if n_term: - primary.appendleft("[{!s}]-".format(n_term)) + primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-') if c_term: - primary.append('-[{!s}]'.format(c_term)) + primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term)) if labile_modifications: primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications]) if unlocalized_modifications: @@ -1278,10 +1403,23 @@ def __repr__(self): def __getitem__(self, i): if isinstance(i, slice): + props = self.properties.copy() + return self.__class__(self.sequence[i], self.properties) else: return self.sequence[i] + def __eq__(self, other): + if isinstance(other, str): + return str(self) == other + elif other is None: + return False + else: + return self.sequence == other.sequence and self.properties == other.properties + + def __ne__(self, other): + return not self == other + @classmethod def parse(cls, string): return cls(*parse_proforma(string)) @@ -1301,16 +1439,29 @@ def mass(self): mass += std_aa_mass[aa] if aa in fixed_rules: mass += fixed_rules[aa] - tag = position[1] - if tag: - try: - mass += tag.mass - except (AttributeError, KeyError): - continue + tags = position[1] + if tags: + for tag in tags: + try: + mass += tag.mass + except (AttributeError, KeyError): + continue for mod in self.properties['labile_modifications']: mass += mod.mass for mod in self.properties['unlocalized_modifications']: mass += mod.mass + if self.properties.get('n_term'): + for mod in self.properties['n_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue + if self.properties.get('c_term'): + for mod in 
self.properties['c_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue for iv in self.properties['intervals']: try: mass += iv.tag.mass @@ -1318,3 +1469,29 @@ def mass(self): continue return mass + def find_tags_by_id(self, tag_id, include_position=True): + if not tag_id.startswith("#"): + tag_id = "#" + tag_id + if tag_id not in self.properties['group_ids']: + return [] + matches = [] + for i, (_token, tags) in enumerate(self.sequence): + if tags: + for tag in tags: + if tag.group_id == tag_id: + if include_position: + matches.append((i, tag)) + else: + matches.append(tag) + for iv in self.properties['intervals']: + if iv.tag.group_id == tag_id: + matches.append((iv, iv.tag) if include_position else iv.tag) + for ulmod in self.properties['unlocalized_modifications']: + if ulmod.group_id == tag_id: + matches.append(('unlocalized_modifications', ulmod) + if include_position else ulmod) + for lamod in self.properties['labile_modifications']: + if lamod.group_id == tag_id: + matches.append(('labile_modifications', lamod) + if include_position else lamod) + return matches diff --git a/tests/test_proforma.py b/tests/test_proforma.py new file mode 100644 index 00000000..b488c3cb --- /dev/null +++ b/tests/test_proforma.py @@ -0,0 +1,65 @@ + +from ast import parse +from os import path +import unittest +import pyteomics +pyteomics.__path__ = [path.abspath( + path.join(path.dirname(__file__), path.pardir, 'pyteomics'))] +from pyteomics import proforma +from pyteomics.proforma import ( + ProForma, TaggedInterval, parse_proforma, MassModification, + ModificationRule, StableIsotope, GenericModification, to_proforma, + ) + + +class ProFormaTest(unittest.TestCase): + maxDiff = None + + def test_complicated_short(self): + complicated_short = r"<[Carbamidomethyl]@C><13C>[Hydroxylation]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING" + tokens, properties = parse_proforma(complicated_short) + assert len(tokens) == 8 + assert len(properties['n_term']) 
== 1 + assert properties['n_term'][0] == 'Hex' + assert len(properties['intervals']) == 1 + assert properties['intervals'][0] == TaggedInterval(2, 5, [MassModification(18.15)]) + assert len(properties['isotopes']) == 1 + assert properties['isotopes'][0] == StableIsotope("13C") + assert properties['fixed_modifications'][0] == ModificationRule( + GenericModification('Carbamidomethyl', None, None), ['C']) + assert to_proforma(tokens, **properties) == complicated_short + self.assertAlmostEqual(ProForma(tokens, properties).mass, 1192.498297, 3) + + + def test_ranges(self): + seq = "PRQT(EQC[Carbamidomethyl]FQRMS)[+19.0523]ISK" + parsed = proforma.ProForma.parse(seq) + assert str(parsed) == seq + + def test_error_on_nested_range(self): + self.assertRaises(proforma.ProFormaError, lambda: parse_proforma( + "PRQT(EQ(CFQR)[Carbamidomethyl]MS)[+19.0523]ISK")) + + def test_localization_scores(self): + seq = "EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK" + obj = ProForma.parse(seq) + tags = obj.find_tags_by_id("#g1") + solutions = {4: 0.01, 5: 0.09, 7: 0.9} + for i, tag in tags: + marker = tag.find_tag_type(proforma.TagTypeEnum.localization_marker)[0] + expected = solutions[i] + assert expected == marker.value + + def test_multiple_info(self): + i = proforma.ProForma.parse( + "ELVIS[Phospho|INFO:newly discovered|info:really awesome]K") + tags = i[4][1][0].find_tag_type(proforma.TagTypeEnum.info) + messages = set(['newly discovered', 'really awesome']) + assert len(tags) == 2 + for tag in tags: + messages.remove(tag.value) + assert len(messages) == 0 + + +if __name__ == '__main__': + unittest.main() From 35b46588569e07868a8f5ec41d3efb09fb4ecc3c Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 23 May 2021 23:14:34 -0400 Subject: [PATCH 11/27] No f-strings --- pyteomics/proforma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 49df3ed0..89cbc187 100644 --- a/pyteomics/proforma.py +++ 
b/pyteomics/proforma.py @@ -602,7 +602,7 @@ def resolve(self): else: cnt = 1 if tok not in self.valid_monosaccharides: - raise ValueError(f"{tok!r} is not a valid monosaccharide name") + raise ValueError("{tok!r} is not a valid monosaccharide name".format(**locals())) composite[tok] += cnt mass = 0 chemcomp = Composition() From feda7d0ff4b38306e1586d832c7c51e9e3f588db Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 23 May 2021 23:17:47 -0400 Subject: [PATCH 12/27] Use explicit super --- pyteomics/proforma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 89cbc187..291e9c3f 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1053,7 +1053,7 @@ def _transform(self, value): return tag def process(self): - value = super().process() + value = super(TagParser, self).process() if not isinstance(value, list): value = [value] return value From 8be8fc597ed469b461784ae14ef83d692a06b2d9 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 23 May 2021 23:23:51 -0400 Subject: [PATCH 13/27] Add unknown amino acid --- pyteomics/proforma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 291e9c3f..2cbf6f85 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1095,7 +1095,7 @@ class ParserStateEnum(Enum): POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag DONE = ParserStateEnum.done -VALID_AA = set("QWERTYIPASDFGHKLCVNM") +VALID_AA = set("QWERTYIPASDFGHKLCVNMX") def parse_proforma(sequence): '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a From 5508775aa222c4163128e15462495dfb9d4b876a Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Thu, 27 May 2021 11:53:26 -0400 Subject: [PATCH 14/27] Fix terminal masses --- pyteomics/proforma.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 2cbf6f85..fc55fb11 100644 --- 
a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -46,6 +46,7 @@ coupled with minimal information about mass and position data. ''' +from pyteomics.mass.mass import calculate_mass import re import warnings from collections import namedtuple, defaultdict, deque @@ -1456,12 +1457,15 @@ def mass(self): mass += mod.mass except (AttributeError, KeyError): continue + mass += calculate_mass(formula="H") if self.properties.get('c_term'): for mod in self.properties['c_term']: try: mass += mod.mass except (AttributeError, KeyError): continue + + mass += calculate_mass(formula="OH") for iv in self.properties['intervals']: try: mass += iv.tag.mass From 7caff0a2f2ce600867c0bf9abb008486abe41778 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Thu, 27 May 2021 12:24:14 -0400 Subject: [PATCH 15/27] update test --- tests/test_proforma.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_proforma.py b/tests/test_proforma.py index b488c3cb..ed24eca3 100644 --- a/tests/test_proforma.py +++ b/tests/test_proforma.py @@ -28,7 +28,8 @@ def test_complicated_short(self): assert properties['fixed_modifications'][0] == ModificationRule( GenericModification('Carbamidomethyl', None, None), ['C']) assert to_proforma(tokens, **properties) == complicated_short - self.assertAlmostEqual(ProForma(tokens, properties).mass, 1192.498297, 3) + self.assertAlmostEqual( + ProForma(tokens, properties).mass, 1210.5088, 3) def test_ranges(self): From 2b9402b301adecf5065b5eff16f70544339cb6bb Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 12:14:46 -0400 Subject: [PATCH 16/27] Fully support all the required additional amino acids --- pyteomics/mass/mass.py | 3 +++ pyteomics/proforma.py | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py index d238b289..2348794e 100644 --- a/pyteomics/mass/mass.py +++ b/pyteomics/mass/mass.py @@ -394,6 +394,7 @@ def mass(self, **kwargs): 'G': 
Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}), 'H': Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}), 'I': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), + 'J': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), 'K': Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}), 'L': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), 'M': Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}), @@ -726,6 +727,7 @@ def isotopologues(*args, **kwargs): 'C': 103.00919, 'L': 113.08406, 'I': 113.08406, + 'J': 113.08406, 'N': 114.04293, 'D': 115.02694, 'Q': 128.05858, @@ -739,6 +741,7 @@ def isotopologues(*args, **kwargs): 'Y': 163.06333, 'W': 186.07931, 'O': 237.14773, + 'U': 168.065, } """A dictionary with monoisotopic masses of the twenty standard amino acid residues, selenocysteine and pyrrolysine. diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index fc55fb11..264273a3 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -69,7 +69,7 @@ from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache) except ImportError: def _needs_psims(name): - raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims" % name) + raise ImportError("Loading %s requires the `psims` library. 
To access it, please install `psims`" % name) load_psimod = partial(_needs_psims, 'PSIMOD') load_xlmod = partial(_needs_psims, 'XLMOD') @@ -1096,7 +1096,7 @@ class ParserStateEnum(Enum): POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag DONE = ParserStateEnum.done -VALID_AA = set("QWERTYIPASDFGHKLCVNMX") +VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB") def parse_proforma(sequence): '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a @@ -1437,7 +1437,10 @@ def mass(self): for position in self.sequence: aa = position[0] - mass += std_aa_mass[aa] + try: + mass += std_aa_mass[aa] + except KeyError: + warnings.warn("%r does not have an exact mass" % (aa, )) if aa in fixed_rules: mass += fixed_rules[aa] tags = position[1] From 5f5166e6e8d67244d1cfbdbe536ed06fad066414 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 15:21:19 -0400 Subject: [PATCH 17/27] Remove duplicated undehydrated selenocysteine mass --- pyteomics/mass/mass.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py index 2348794e..cb0777c3 100644 --- a/pyteomics/mass/mass.py +++ b/pyteomics/mass/mass.py @@ -741,7 +741,6 @@ def isotopologues(*args, **kwargs): 'Y': 163.06333, 'W': 186.07931, 'O': 237.14773, - 'U': 168.065, } """A dictionary with monoisotopic masses of the twenty standard amino acid residues, selenocysteine and pyrrolysine. 
From 5937299bdf9c4602d732e1768195f6b6f661ac39 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 16:05:22 -0400 Subject: [PATCH 18/27] Properly handle nested braces and isotopes --- pyteomics/proforma.py | 32 +++++++++++++++++++++++++++++--- tests/test_proforma.py | 5 +++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 264273a3..f49bfd9f 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -558,12 +558,32 @@ def resolve(self): class FormulaModification(ModificationBase): prefix_name = "Formula" + isotope_pattern = re.compile(r'\[(?P\d+)(?P[A-Z][a-z]*)(?P[\-+]?\d+)\]') _tag_type = TagTypeEnum.formula + def _normalize_isotope_notation(self, match): + '''Rewrite ProForma isotope notation to Pyteomics-compatible + isotope notation. + + Parameters + ---------- + match : Match + The matched isotope notation string parsed by the regular expression. + + Returns + reformatted : str + The re-written isotope notation + ''' + parts = match.groupdict() + return "{element}[{isotope}]{quantity}".format(**parts) + def resolve(self): - # The handling of fixed isotopes is wrong here as Pyteomics uses a different - # convention. - composition = Composition(formula=''.join(self.value.split(" "))) + normalized = ''.join(self.value.split(" ")) + # If there is a [ character in the formula, we know there are isotopes which + # need to be normalized. 
+ if '[' in normalized: + normalized = self.isotope_pattern.sub(self._normalize_isotope_notation, normalized) + composition = Composition(formula=normalized) return { "mass": composition.mass(), "composition": composition, @@ -1199,6 +1219,7 @@ def parse_proforma(sequence): elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL: if c == '[': depth += 1 + current_tag.append(c) elif c == ']': depth -= 1 if depth <= 0: @@ -1212,6 +1233,8 @@ def parse_proforma(sequence): state = DONE elif state == GLOBAL: state = POST_GLOBAL + else: + current_tag.append(c) else: current_tag.append(c) elif state == FIXED: @@ -1243,11 +1266,14 @@ def parse_proforma(sequence): elif state == INTERVAL_TAG: if c == '[': depth += 1 + current_tag.append(c) elif c == ']': depth -= 1 if depth <= 0: state = POST_INTERVAL_TAG depth = 0 + else: + current_tag.append(c) else: current_tag.append(c) elif state == POST_INTERVAL_TAG: diff --git a/tests/test_proforma.py b/tests/test_proforma.py index ed24eca3..36b9ea7f 100644 --- a/tests/test_proforma.py +++ b/tests/test_proforma.py @@ -61,6 +61,11 @@ def test_multiple_info(self): messages.remove(tag.value) assert len(messages) == 0 + def test_formula(self): + i = proforma.ProForma.parse("SEQUEN[Formula:[13C2]CH6N]CE") + mod = i[-3][1][0] + assert mod.composition == proforma.Composition( + {'H': 6, 'C[13]': 2, 'C': 1, 'N': 1}) if __name__ == '__main__': unittest.main() From 53c330af9a37b792211bf27eb626abe31ce93ca3 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 16:07:02 -0400 Subject: [PATCH 19/27] Update pyteomics/proforma.py Co-authored-by: Lev Levitsky --- pyteomics/proforma.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index f49bfd9f..2402aae6 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -44,6 +44,12 @@ :py:class:`ProForma` - An object oriented version of the parsing and formatting code, coupled with minimal information about 
mass and position data. + +Dependencies +------------ + +To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required. + ''' from pyteomics.mass.mass import calculate_mass From e77ca1a8e81dc4add36eda5dac0beb10c47e8c38 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 16:12:03 -0400 Subject: [PATCH 20/27] Update pyteomics/proforma.py Co-authored-by: Lev Levitsky --- pyteomics/proforma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 2402aae6..59357839 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -402,7 +402,7 @@ def __init__(self, **kwargs): self._database = kwargs.get("database") def load_database(self): - return load_psimod() + return load_xlmod() def resolve(self, name=None, id=None, **kwargs): if name is not None: From 25fde39864404aba2b1471df1faa9fb4e3031f38 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 31 May 2021 17:20:40 -0400 Subject: [PATCH 21/27] Add compliance level to documentation --- pyteomics/proforma.py | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 59357839..63185692 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -50,6 +50,50 @@ To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required. + +Compliance Levels +----------------- + +1. Base Level Support +Represents the lowest level of compliance, this level involves providing support for: + + - [x] Amino acid sequences + - [x] Protein modifications using two of the supported CVs/ontologies: Unimod and PSI-MOD. + - [x] Protein modifications using delta masses (without prefixes) + - [x] N-terminal, C-terminal and labile modifications. + - [x] Ambiguity in the modification position, including support for localisation scores. + - [x] INFO tag. + +2. 
Additional Separate Support +These features are independent from each other: + + - [x] Unusual amino acids (O and U). + - [x] Ambiguous amino acids (e.g. X, B, Z). This would include support for sequence tags of known mass (using the character X). + - [x] Protein modifications using delta masses (using prefixes for the different CVs/ontologies). + - [x] Use of prefixes for Unimod (U:) and PSI-MOD (M:) names. + - [x] Support for the joint representation of experimental data and its interpretation. + +3. Top Down Extensions + + - [ ] Additional CV/ontologies for protein modifications: RESID (the prefix R MUST be used for RESID CV/ontology term names) + - [x] Chemical formulas (this feature occurs in two places in this list). + +4. Cross-Linking Extensions + + - [ ] Cross-linked peptides (using the XL-MOD CV/ontology, the prefix X MUST be used for XL-MOD CV/ontology term names). + +5. Glycan Extensions + + - [x] Additional CV/ontologies for protein modifications: GNO (the prefix G MUST be used for GNO CV/ontology term names) + - [x] Glycan composition. + - [x] Chemical formulas (this feature occurs in two places in this list). + +6. Spectral Support + + - [ ] Charge and chimeric spectra are special cases. + - [x] Global modifications (e.g., every C is C13). 
+ + ''' from pyteomics.mass.mass import calculate_mass From da164db1c9877c85e10b4ae8b44cdf30256c6035 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Wed, 2 Jun 2021 20:54:36 -0400 Subject: [PATCH 22/27] Fix up glycan mass calculation --- .github/workflows/pythonpackage.yml | 2 +- pyteomics/proforma.py | 37 ++++++++++++++++++++--------- tests/test_proforma.py | 17 +++++++++---- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 5ca06e87..a79d2011 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -26,7 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install numpy - pip install lxml sqlalchemy pandas cython h5py hdf5plugin + pip install lxml sqlalchemy pandas cython h5py hdf5plugin psims pip install pynumpress - name: Run the tests run: | diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 63185692..5c733037 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -12,9 +12,9 @@ Data Access ----------- -:py:func:`parse_proforma` - The primary interface for parsing ProForma strings. +:py:func:`parse` - The primary interface for parsing ProForma strings. 
- >>> parse_proforma("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK") + >>> parse("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK") ([('E', None), ('M', GenericModification('Oxidation', None, None)), ('E', None), @@ -99,7 +99,7 @@ from pyteomics.mass.mass import calculate_mass import re import warnings -from collections import namedtuple, defaultdict, deque +from collections import deque from functools import partial try: @@ -108,13 +108,10 @@ # Python 2 doesn't have a builtin Enum type Enum = object - -from pyteomics import parser from pyteomics.mass import Composition, std_aa_mass, Unimod from pyteomics.auxiliary import PyteomicsError, BasicComposition from pyteomics.auxiliary.utils import add_metaclass -# To eventually be implemented with pyteomics port? try: from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache) except ImportError: @@ -126,6 +123,7 @@ def _needs_psims(name): load_gno = partial(_needs_psims, 'GNO') obo_cache = None +_water_mass = calculate_mass("H2O") std_aa_mass = std_aa_mass.copy() std_aa_mass['X'] = 0 @@ -481,13 +479,24 @@ def __init__(self, **kwargs): def load_database(self): return load_gno() + def get_mass_from_glycan_composition(self, term): + val = term.get('GNO:00000202') + if val: + tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val) + mass = 0.0 + for symbol, count in tokens: + mass += GlycanModification.valid_monosaccharides[symbol][0] * int(count) + return mass + return None + def get_mass_from_term(self, term): + raw_mass = self.get_mass_from_glycan_composition(term) root_id = 'GNO:00000001' parent = term.parent() if isinstance(parent, list): parent = parent[0] while parent.id != root_id: - next_parent = term.parent() + next_parent = parent.parent() if isinstance(next_parent, list): next_parent = next_parent[0] if next_parent.id == root_id: @@ -496,7 +505,11 @@ def get_mass_from_term(self, term): match = self.mass_pattern.search(parent.name) if 
not match: return None - return float(match.group(1)) + # This will have a small mass error. + rough_mass = float(match.group(1)) - _water_mass + if abs(rough_mass - raw_mass) < 1: + return raw_mass + return rough_mass def resolve(self, name=None, id=None, **kwargs): if name is not None: @@ -512,6 +525,7 @@ def resolve(self, name=None, id=None, **kwargs): "composition": None, "mass": self.get_mass_from_term(term) } + return rec class GenericResolver(ModificationResolver): @@ -628,7 +642,7 @@ def _normalize_isotope_notation(self, match): return "{element}[{isotope}]{quantity}".format(**parts) def resolve(self): - normalized = ''.join(self.value.split(" ")) + normalized = self.value.replace(' ', '') # If there is a [ character in the formula, we know there are isotopes which # need to be normalized. if '[' in normalized: @@ -656,6 +670,7 @@ class GlycanModification(ModificationBase): "NeuAc": (291.0954, Composition("C11H17N1O8")), "NeuGc": (307.0903, Composition("C11H17N1O9")), "Pen": (132.0422, Composition("C5H8O4")), + "Pent": (132.0422, Composition("C5H8O4")), "Fuc": (146.0579, Composition("C6H10O4")) } @@ -1168,7 +1183,7 @@ class ParserStateEnum(Enum): VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB") -def parse_proforma(sequence): +def parse(sequence): '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a mapping of sequence-spanning modifiers. 
@@ -1499,7 +1514,7 @@ def __ne__(self, other): @classmethod def parse(cls, string): - return cls(*parse_proforma(string)) + return cls(*parse(string)) @property def mass(self): diff --git a/tests/test_proforma.py b/tests/test_proforma.py index 36b9ea7f..b19673ad 100644 --- a/tests/test_proforma.py +++ b/tests/test_proforma.py @@ -7,9 +7,9 @@ path.join(path.dirname(__file__), path.pardir, 'pyteomics'))] from pyteomics import proforma from pyteomics.proforma import ( - ProForma, TaggedInterval, parse_proforma, MassModification, + ProForma, TaggedInterval, parse, MassModification, ModificationRule, StableIsotope, GenericModification, to_proforma, - ) + obo_cache) class ProFormaTest(unittest.TestCase): @@ -17,7 +17,7 @@ class ProFormaTest(unittest.TestCase): def test_complicated_short(self): complicated_short = r"<[Carbamidomethyl]@C><13C>[Hydroxylation]?{HexNAc}[Hex]-ST[UNIMOD:Oxidation](EPP)[+18.15]ING" - tokens, properties = parse_proforma(complicated_short) + tokens, properties = parse(complicated_short) assert len(tokens) == 8 assert len(properties['n_term']) == 1 assert properties['n_term'][0] == 'Hex' @@ -38,7 +38,7 @@ def test_ranges(self): assert str(parsed) == seq def test_error_on_nested_range(self): - self.assertRaises(proforma.ProFormaError, lambda: parse_proforma( + self.assertRaises(proforma.ProFormaError, lambda: parse( "PRQT(EQ(CFQR)[Carbamidomethyl]MS)[+19.0523]ISK")) def test_localization_scores(self): @@ -67,5 +67,14 @@ def test_formula(self): assert mod.composition == proforma.Composition( {'H': 6, 'C[13]': 2, 'C': 1, 'N': 1}) + def test_gnome(self): + gp = proforma.ProForma.parse("NEEYN[GNO:G59626AS]K") + self.assertAlmostEqual(gp.mass, 2709.016, 3) + + def test_glycan(self): + gp = proforma.ProForma.parse("NEEYN[Glycan:Hex5HexNAc4NeuAc1]K") + self.assertAlmostEqual(gp.mass, 2709.016, 3) + + if __name__ == '__main__': unittest.main() From 803120759da9a2947f8f5cb1bcf23bc186c7c879 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Wed, 2 Jun 2021 
21:25:31 -0400 Subject: [PATCH 23/27] Fix slice behavior --- pyteomics/proforma.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 5c733037..8e9566a2 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1496,8 +1496,7 @@ def __repr__(self): def __getitem__(self, i): if isinstance(i, slice): props = self.properties.copy() - - return self.__class__(self.sequence[i], self.properties) + return self.__class__(self.sequence[i], props) else: return self.sequence[i] From 293b050d4899f651beb7d7cf1f3f25b4e272feba Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 6 Jun 2021 20:48:07 -0400 Subject: [PATCH 24/27] Simplify, more documentation --- pyteomics/proforma.py | 383 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 331 insertions(+), 52 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index 8e9566a2..d88c3510 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -16,14 +16,14 @@ >>> parse("EM[Oxidation]EVT[#g1(0.01)]S[#g1(0.09)]ES[Phospho#g1(0.90)]PEK") ([('E', None), - ('M', GenericModification('Oxidation', None, None)), + ('M', [GenericModification('Oxidation', None, None)]), ('E', None), ('V', None), - ('T', LocalizationMarker(0.01, None, '#g1')), - ('S', LocalizationMarker(0.09, None, '#g1')), + ('T', [LocalizationMarker(0.01, None, '#g1')]), + ('S', [LocalizationMarker(0.09, None, '#g1')]), ('E', None), ('S', - GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')), + [GenericModification('Phospho', [LocalizationMarker(0.9, None, '#g1')], '#g1')]), ('P', None), ('E', None), ('K', None)], @@ -48,7 +48,10 @@ Dependencies ------------ -To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required. +To resolve PSI-MOD, XL-MOD, and GNO identifiers, :mod:`psims` is required. 
By default, +:mod:`psims` retrieves the most recent version of each ontology from the internet, but +includes a fall-back version to use when the network is unavailable. It can also create +an application cache on disk Compliance Levels @@ -90,7 +93,8 @@ 6. Spectral Support - - [ ] Charge and chimeric spectra are special cases. + - [x] Charge state and adducts + - [ ] Chimeric spectra are special cases. - [x] Global modifications (e.g., every C is C13). @@ -108,7 +112,7 @@ # Python 2 doesn't have a builtin Enum type Enum = object -from pyteomics.mass import Composition, std_aa_mass, Unimod +from pyteomics.mass import Composition, std_aa_mass, Unimod, nist_mass from pyteomics.auxiliary import PyteomicsError, BasicComposition from pyteomics.auxiliary.utils import add_metaclass @@ -128,6 +132,10 @@ def _needs_psims(name): std_aa_mass = std_aa_mass.copy() std_aa_mass['X'] = 0 +element_symbols = set(nist_mass) +element_symbols.remove("e*") +element_symbols.add('e') + class ProFormaError(PyteomicsError): def __init__(self, message, index=None, parser_state=None, **kwargs): @@ -480,17 +488,64 @@ def load_database(self): return load_gno() def get_mass_from_glycan_composition(self, term): + '''Parse the Byonic-style glycan composition from property GNO:00000202 + to get the counts of each monosaccharide and use that to calculate mass. + + The mass computed here is exact and dehydrated, distinct from the rounded-off + mass that :meth:`get_mass_from_term` will produce by walking up the CV term + hierarchy. However, not all glycan compositions are representable in GNO:00000202 + format, so this may silently be absent or incomplete, hence the double-check in + :meth:`get_mass_from_term`. + + Parameters + ---------- + term : psims.controlled_vocabulary.Entity + The CV entity being parsed. + + Returns + ------- + mass : float or :const:`None` + If a glycan composition is found on the term, the computed + mass will be returned. 
Otherwise the :const:`None` is returned + ''' val = term.get('GNO:00000202') + monosaccharides = BasicComposition() + composition = Composition() if val: tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val) mass = 0.0 for symbol, count in tokens: - mass += GlycanModification.valid_monosaccharides[symbol][0] * int(count) - return mass - return None + count = int(count) + try: + mono_mass, mono_comp = GlycanModification.valid_monosaccharides[symbol] + mass += mono_mass * count + composition += mono_comp * count + monosaccharides[symbol] += count + except KeyError: + continue + return mass, monosaccharides, composition + return None, None, None + + def get_mass_from_term(self, term, raw_mass): + '''Walk up the term hierarchy and find the mass group + term near the root of the tree, and return the most accurate + mass available for the provided term. - def get_mass_from_term(self, term): - raw_mass = self.get_mass_from_glycan_composition(term) + The mass group term's mass is rounded to two decimal places, leading + to relatively large errors. + + Parameters + ---------- + term : psims.controlled_vocabulary.Entity + The CV entity being parsed. + + Returns + ------- + mass : float or :const:`None` + If a root node is found along the term's lineage, computed + mass will be returned. Otherwise the :const:`None` is returned. + The mass may be + ''' root_id = 'GNO:00000001' parent = term.parent() if isinstance(parent, list): @@ -507,8 +562,11 @@ def get_mass_from_term(self, term): return None # This will have a small mass error. rough_mass = float(match.group(1)) - _water_mass - if abs(rough_mass - raw_mass) < 1: + if raw_mass is not None and abs(rough_mass - raw_mass) < 1: return raw_mass + warnings.warn( + ("An accurate glycan composition could not be inferred from %s. 
" + "Only a rough approximation is available.") % (term, )) return rough_mass def resolve(self, name=None, id=None, **kwargs): @@ -518,12 +576,15 @@ def resolve(self, name=None, id=None, **kwargs): term = self.database[id] else: raise ValueError("Must provide one of `name` or `id`") + raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term) + rec = { "name":term.name, "id": term.id, "provider": self.name, - "composition": None, - "mass": self.get_mass_from_term(term) + "composition": composition, + "monosaccharides": monosaccharides, + "mass": self.get_mass_from_term(term, raw_mass) } return rec @@ -730,9 +791,13 @@ class GNOmeModification(ModificationBase): resolver = GNOResolver() prefix_name = "GNO" - # short_prefix = 'G' + short_prefix = 'G' _tag_type = TagTypeEnum.gnome + @property + def monosaccharides(self): + return self.definition.get('monosaccharides') + class XLMODModification(ModificationBase): __slots__ = () @@ -993,33 +1058,113 @@ class TaggedInterval(object): The starting position (inclusive) of the interval along the primary sequence end: int The ending position (exclusive) of the interval along the primary sequence - tag: TagBase - The tag being localized + tags: list[TagBase] + The tags being localized ''' - __slots__ = ('start', 'end', 'tag') + __slots__ = ('start', 'end', 'tags') - def __init__(self, start, end=None, tag=None): + def __init__(self, start, end=None, tags=None): self.start = start self.end = end - self.tag = tag + self.tags = tags def __eq__(self, other): if other is None: return False - return self.start == other.start and self.end == other.end and self.tag == other.tag + return self.start == other.start and self.end == other.end and self.tags == other.tags def __ne__(self, other): return not self == other def __str__(self): - return "({self.start}-{self.end}){self.tag!r}".format(self=self) + return "({self.start}-{self.end}){self.tags!r}".format(self=self) def __repr__(self): - return 
"{self.__class__.__name__}({self.start}, {self.end}, {self.tag})".format(self=self) + return "{self.__class__.__name__}({self.start}, {self.end}, {self.tags})".format(self=self) def as_slice(self): return slice(self.start, self.end) + def copy(self): + return self.__class__(self.start, self.end, self.tags) + + def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): + if end is None: + qend = self.end + 1 + else: + qend = end + if start is None: + qstart = self.start - 1 + else: + qstart = start + + # Fully contained interval + valid = qstart <= self.start and qend >= self.end + + if not valid: + # Spans the beginning but not the end + valid = qstart <= self.start and qend > self.start + if valid and warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + + if not valid: + # Spans the end but not the beginning + valid = qstart < self.end and qend > self.end + if valid and warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + + if not valid: + # Contained interval + valid = qstart >= self.start and qend < self.end + if valid and warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + + if not valid: + return None + new = self.copy() + if start is not None: + diff = self.start - start + if diff < 0: + diff = 0 + new.start = diff + if end is not None: + width = min(new.end, end) - self.start + else: + width = self.end - max(start, self.start) + new.end = new.start + width + return new + + +class ChargeState(object): + '''Describes the charge and adduct types of the structure. + + Attributes + ---------- + charge : int + The total charge state as a signed number. + adducts : list[str] + Each charge carrier associated with the molecule. 
+ ''' + __slots__ = ("charge", "adducts") + + def __init__(self, charge, adducts=None): + if adducts is None: + adducts = [] + self.charge = charge + self.adducts = adducts + + def __str__(self): + tokens = [str(self.charge)] + if self.adducts: + tokens.append("[") + tokens.append(','.join(str(adduct) for adduct in self.adducts)) + tokens.append("]") + return ''.join(tokens) + + def __repr__(self): + template = "{self.__class__.__name__}({self.charge}, {self.adducts})" + return template.format(self=self) + class TokenBuffer(object): '''A token buffer that wraps the accumulation and reset logic @@ -1098,17 +1243,19 @@ def __call__(self): class NumberParser(TokenBuffer): '''A buffer which accumulates tokens until it is asked to parse them into :class:`int` instances. + ''' - Implements a subset of the Sequence protocol. + def _transform(self, value): + return int(''.join(value)) - Attributes - ---------- - buffer: list - The list of tokens accumulated since the last parsing. + +class StringParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`str` instances. 
''' def _transform(self, value): - return int(''.join(value)) + return ''.join(value) class TagParser(TokenBuffer): @@ -1161,6 +1308,14 @@ class ParserStateEnum(Enum): post_global = 12 post_global_aa = 13 post_interval_tag = 14 + post_tag_after = 15 + charge_state_start = 16 + charge_state_number = 17 + charge_state_adduct_start = 18 + charge_state_adduct_end = 19 + inter_chain_cross_link_start = 20 + chimeric_start = 21 + done = 999 @@ -1175,10 +1330,15 @@ class ParserStateEnum(Enum): INTERVAL_TAG = ParserStateEnum.interval_tag TAG_AFTER = ParserStateEnum.tag_after_sequence POST_TAG_BEFORE = ParserStateEnum.post_tag_before +POST_TAG_AFTER = ParserStateEnum.post_tag_after UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count POST_GLOBAL = ParserStateEnum.post_global POST_GLOBAL_AA = ParserStateEnum.post_global_aa POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag +CHARGE_START = ParserStateEnum.charge_state_start +CHARGE_NUMBER = ParserStateEnum.charge_state_number +ADDUCT_START = ParserStateEnum.charge_state_adduct_start +ADDUCT_END = ParserStateEnum.charge_state_adduct_end DONE = ParserStateEnum.done VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB") @@ -1198,7 +1358,7 @@ def parse(sequence): Returns ------- - parsed_sequence: list[tuple[str, TagBase]] + parsed_sequence: list[tuple[str, list[TagBase]]] The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence modifiers: dict A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized @@ -1226,9 +1386,15 @@ def parse(sequence): current_unlocalized_count = NumberParser() current_aa_targets = TokenBuffer() + charge_buffer = None + adduct_buffer = None + + # A mostly context free finite state machine unrolled + # by hand. 
while i < n: c = sequence[i] i += 1 + # Initial state prior to sequence content if state == BEFORE: if c == '[': state = TAG_BEFORE @@ -1244,6 +1410,7 @@ def parse(sequence): else: raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + # The body of the amino acid sequence. elif state == SEQ: if c in VALID_AA: positions.append((current_aa, current_tag() if current_tag else None)) @@ -1279,9 +1446,16 @@ def parse(sequence): raise ProFormaError("Missing Closing Tag", i, state) i += 1 depth = 1 + elif c == '/': + state = CHARGE_START + charge_buffer = NumberParser() + elif c == '+': + raise ProFormaError( + "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) - elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL: + # Tag parsing which rely on `current_tag` to buffer tokens. + elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG: if c == '[': depth += 1 current_tag.append(c) @@ -1295,20 +1469,26 @@ def parse(sequence): state = POST_TAG_BEFORE elif state == TAG_AFTER: c_term = current_tag() - state = DONE + state = POST_TAG_AFTER elif state == GLOBAL: state = POST_GLOBAL + elif state == INTERVAL_TAG: + state = POST_INTERVAL_TAG + depth = 0 else: current_tag.append(c) else: current_tag.append(c) + # Handle transition to fixed modifications or isotope labeling from opening signal. 
elif state == FIXED: if c == '[': state = GLOBAL else: # Do validation here state = ISOTOPE + current_tag.reset() current_tag.append(c) + # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens elif state == ISOTOPE: if c != '>': current_tag.append(c) @@ -1317,6 +1497,7 @@ def parse(sequence): isotopes.append(StableIsotope(''.join(current_tag))) current_tag.reset() state = BEFORE + # Handle labile modifications, which rely on `current_tag` to buffer tokens elif state == LABILE: if c == '{': depth += 1 @@ -1328,26 +1509,18 @@ def parse(sequence): state = BEFORE else: current_tag.append(c) - elif state == INTERVAL_TAG: - if c == '[': - depth += 1 - current_tag.append(c) - elif c == ']': - depth -= 1 - if depth <= 0: - state = POST_INTERVAL_TAG - depth = 0 - else: - current_tag.append(c) - else: - current_tag.append(c) + # The intermediate state between an interval tag and returning to sequence parsing. + # A new tag may start immediately, leading to it being appended to the interval instead + # instead of returning to the primary sequence. Because this state may also occur at the + # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags, + # charge states, and the like. elif state == POST_INTERVAL_TAG: if c == '[': current_tag.bound() state = INTERVAL_TAG elif c in VALID_AA: current_aa = c - current_interval.tag = current_tag() + current_interval.tags = current_tag() intervals.append(current_interval) current_interval = None state = SEQ @@ -1357,6 +1530,17 @@ def parse(sequence): raise ProFormaError("Missing Closing Tag", i, state) i += 1 depth = 1 + elif c == '/': + state = CHARGE_START + charge_buffer = NumberParser() + elif c == '+': + raise ProFormaError( + "Error In State {state}, {c} found at index {i}. 
Chimeric representation not supported".format(**locals()), i, state) + else: + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + # An intermediate state for discriminating which type of tag-before-sequence type + # we just finished parsing. elif state == POST_TAG_BEFORE: if c == '?': unlocalized_modifications.append(current_tag()[0]) @@ -1405,8 +1589,57 @@ def parse(sequence): else: raise ProFormaError( ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state) + elif state == POST_TAG_AFTER: + if c == '/': + state = CHARGE_START + charge_buffer = NumberParser() + elif c == '+': + raise ProFormaError( + "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) + elif state == CHARGE_START: + if c in '+-': + charge_buffer.append(c) + state = CHARGE_NUMBER + elif c.isdigit(): + charge_buffer.append(c) + state = CHARGE_NUMBER + elif c == '/': + state = ParserStateEnum.inter_chain_cross_link_start + raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state) + else: + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + elif state == CHARGE_NUMBER: + if c.isdigit(): + charge_buffer.append(c) + elif c == "[": + state = ADDUCT_START + adduct_buffer = StringParser() + else: + raise ProFormaError( + "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + elif state == ADDUCT_START: + if c.isdigit() or c in "+-" or c in element_symbols: + adduct_buffer.append(c) + elif c == ',': + adduct_buffer.bound() + elif c == ']': + state = ADDUCT_END + elif state == ADDUCT_END: + if c == '+': + raise ProFormaError( + "Error In State {state}, {c} found at index {i}. 
Chimeric representation not supported".format(**locals()), i, state) else: raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + if charge_buffer: + charge_number = charge_buffer() + if adduct_buffer: + adducts = adduct_buffer() + else: + adducts = None + charge_state = ChargeState(charge_number, adducts) + else: + charge_state = None if current_aa: positions.append((current_aa, current_tag() if current_tag else None)) if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): @@ -1419,13 +1652,14 @@ def parse(sequence): 'fixed_modifications': fixed_modifications, 'intervals': intervals, 'isotopes': isotopes, - 'group_ids': sorted(current_tag.group_ids) + 'group_ids': sorted(current_tag.group_ids), + 'charge_state': charge_state } def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None, labile_modifications=None, fixed_modifications=None, intervals=None, - isotopes=None, group_ids=None): + isotopes=None, charge_state=None, group_ids=None): '''Convert a sequence plus modifiers into formatted text following the ProForma specification. @@ -1447,6 +1681,8 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No A list of modified intervals, if any isotopes : Optional[list[StableIsotope]] Any global stable isotope labels applied + charge_state : Optional[ChargeState] + An optional charge state value group_ids : Optional[list[str]] Any group identifiers. This parameter is currently not used. 
@@ -1465,11 +1701,13 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No primary[iv.start] = '(' + primary[iv.start] primary[iv.end - 1] = '{0!s})'.format( - primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tag) + primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tags) if n_term: primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-') if c_term: primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term)) + if charge_state: + primary.append("/{!s}".format(charge_state)) if labile_modifications: primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications]) if unlocalized_modifications: @@ -1482,11 +1720,46 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No return ''.join(primary) +class _ProFormaProperty(object): + def __init__(self, name): + self.name = name + + def __get__(self, obj, cls): + return obj.properties[self.name] + + def __set__(self, obj, value): + obj.properties[self.name] = value + + def __repr__(self): + template = "{self.__class__.__name__}({self.name!r})" + return template.format(self=self) + + class ProForma(object): + '''Represent a parsed ProForma sequence. 
+ + Attributes + ---------- + sequence : list[tuple[]] + ''' + def __init__(self, sequence, properties): self.sequence = sequence self.properties = properties + isotopes = _ProFormaProperty('isotopes') + charge_state = _ProFormaProperty('charge_state') + + intervals = _ProFormaProperty('intervals') + fixed_modifications = _ProFormaProperty('fixed_modifications') + labile_modifications = _ProFormaProperty('labile_modifications') + unlocalized_modifications = _ProFormaProperty('unlocalized_modifications') + + n_term = _ProFormaProperty('n_term') + c_term = _ProFormaProperty('c_term') + + group_ids = _ProFormaProperty('group_ids') + def __str__(self): return to_proforma(self.sequence, **self.properties) @@ -1496,6 +1769,14 @@ def __repr__(self): def __getitem__(self, i): if isinstance(i, slice): props = self.properties.copy() + ivs = [] + for iv in props['intervals']: + iv = iv._update_coordinates_sliced( + i.start, i.stop) + if iv is None: + continue + ivs.append(iv) + props['intervals'] = ivs return self.__class__(self.sequence[i], props) else: return self.sequence[i] @@ -1569,8 +1850,6 @@ def mass(self): def find_tags_by_id(self, tag_id, include_position=True): if not tag_id.startswith("#"): tag_id = "#" + tag_id - if tag_id not in self.properties['group_ids']: - return [] matches = [] for i, (_token, tags) in enumerate(self.sequence): if tags: From 325088d68053ac5bdfaed7bb08e0009e26464d57 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 13 Jun 2021 15:56:53 -0400 Subject: [PATCH 25/27] Add ambiguous sequence regions --- pyteomics/proforma.py | 100 ++++++++++++++++++++++++++++------------- tests/test_proforma.py | 11 ++++- 2 files changed, 79 insertions(+), 32 deletions(-) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py index d88c3510..67d08fa2 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1048,9 +1048,17 @@ def __repr__(self): return "{self.__class__.__name__}({self.isotope})".format(self=self) +class IntersectionEnum(Enum): + 
no_overlap = 0 + full_contains_interval = 1 + full_contained_in_interval = 2 + start_overlap = 3 + end_overlap = 4 + + class TaggedInterval(object): '''Define a fixed interval over the associated sequence which contains the localization - of the associated tag. + of the associated tag or denotes a region of general sequence order ambiguity. Attributes ---------- @@ -1060,13 +1068,16 @@ class TaggedInterval(object): The ending position (exclusive) of the interval along the primary sequence tags: list[TagBase] The tags being localized + ambiguous : bool + Whether the interval is ambiguous or not ''' - __slots__ = ('start', 'end', 'tags') + __slots__ = ('start', 'end', 'tags', 'ambiguous') - def __init__(self, start, end=None, tags=None): + def __init__(self, start, end=None, tags=None, ambiguous=False): self.start = start self.end = end self.tags = tags + self.ambiguous = ambiguous def __eq__(self, other): if other is None: @@ -1088,37 +1099,48 @@ def as_slice(self): def copy(self): return self.__class__(self.start, self.end, self.tags) - def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): - if end is None: - qend = self.end + 1 - else: - qend = end - if start is None: - qstart = self.start - 1 - else: - qstart = start - + def _check_slice(self, qstart, qend, warn_ambiguous): # Fully contained interval valid = qstart <= self.start and qend >= self.end - + case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap if not valid: # Spans the beginning but not the end valid = qstart <= self.start and qend > self.start - if valid and warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % (self, )) + if valid: + case = IntersectionEnum.start_overlap + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) if not valid: # Spans the end but not the beginning valid = qstart < self.end and qend > self.end - if valid and warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % 
(self, )) + if valid: + case = IntersectionEnum.end_overlap + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) if not valid: # Contained interval valid = qstart >= self.start and qend < self.end - if valid and warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % (self, )) + if valid: + case = IntersectionEnum.full_contains_interval + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + return valid, case + + def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): + if end is None: + qend = self.end + 1 + else: + qend = end + if start is None: + qstart = self.start - 1 + else: + qstart = start + valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous) + if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap): + raise ValueError("Cannot bisect an ambiguous interval") if not valid: return None new = self.copy() @@ -1315,7 +1337,7 @@ class ParserStateEnum(Enum): charge_state_adduct_end = 19 inter_chain_cross_link_start = 20 chimeric_start = 21 - + interval_initial = 22 done = 999 @@ -1328,6 +1350,7 @@ class ParserStateEnum(Enum): SEQ = ParserStateEnum.sequence TAG = ParserStateEnum.tag_in_sequence INTERVAL_TAG = ParserStateEnum.interval_tag +INTERVAL_INIT = ParserStateEnum.interval_initial TAG_AFTER = ParserStateEnum.tag_after_sequence POST_TAG_BEFORE = ParserStateEnum.post_tag_before POST_TAG_AFTER = ParserStateEnum.post_tag_after @@ -1411,9 +1434,16 @@ def parse(sequence): raise ProFormaError( "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) # The body of the amino acid sequence. 
- elif state == SEQ: + elif state == SEQ or state == INTERVAL_INIT: + if state == INTERVAL_INIT: + state = SEQ + if c == '?': + if current_interval is not None: + current_interval.ambiguous = True + continue if c in VALID_AA: - positions.append((current_aa, current_tag() if current_tag else None)) + if current_aa is not None: + positions.append((current_aa, current_tag() if current_tag else None)) current_aa = c elif c == '[': state = TAG @@ -1427,6 +1457,7 @@ def parse(sequence): "Nested ranges are not yet supported by ProForma.").format( **locals()), i, state) current_interval = TaggedInterval(len(positions) + 1) + state = INTERVAL_INIT elif c == ')': positions.append( (current_aa, current_tag() if current_tag else None)) @@ -1435,11 +1466,13 @@ def parse(sequence): raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) else: current_interval.end = len(positions) - if i >= n or sequence[i] != '[': - raise ProFormaError("Missing Interval Tag", i, state) - i += 1 - depth = 1 - state = INTERVAL_TAG + if i < n and sequence[i] == '[': + i += 1 + depth = 1 + state = INTERVAL_TAG + else: + intervals.append(current_interval) + current_interval = None elif c == '-': state = TAG_AFTER if i >= n or sequence[i] != '[': @@ -1698,10 +1731,15 @@ def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=No primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags])) if intervals: for iv in sorted(intervals, key=lambda x: x.start): - primary[iv.start] = '(' + primary[iv.start] + if iv.ambiguous: + primary[iv.start] = '(?' 
+ primary[iv.start] + else: + primary[iv.start] = '(' + primary[iv.start] - primary[iv.end - 1] = '{0!s})'.format( - primary[iv.end - 1]) + ''.join('[{!s}]'.format(t) for t in iv.tags) + terminator = '{0!s})'.format(primary[iv.end - 1]) + if iv.tags: + terminator += ''.join('[{!s}]'.format(t) for t in iv.tags) + primary[iv.end - 1] = terminator if n_term: primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-') if c_term: diff --git a/tests/test_proforma.py b/tests/test_proforma.py index b19673ad..e47684c1 100644 --- a/tests/test_proforma.py +++ b/tests/test_proforma.py @@ -32,10 +32,19 @@ def test_complicated_short(self): ProForma(tokens, properties).mass, 1210.5088, 3) - def test_ranges(self): + def test_range(self): seq = "PRQT(EQC[Carbamidomethyl]FQRMS)[+19.0523]ISK" parsed = proforma.ProForma.parse(seq) assert str(parsed) == seq + chunk = parsed[:6] + assert chunk.intervals + + def test_ambiguous_range(self): + seq = "PRQT(?EQC[Carbamidomethyl]FQRMS)ISK" + parsed = proforma.ProForma.parse(seq) + assert str(parsed) == seq + self.assertRaises(ValueError, lambda: parsed[:6]) + def test_error_on_nested_range(self): self.assertRaises(proforma.ProFormaError, lambda: parse( From c54476565af97e7fc92852bda4dfa58a41ac0a5a Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 13 Jun 2021 16:00:31 -0400 Subject: [PATCH 26/27] ProForma testing requires psims --- test-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test-requirements.txt b/test-requirements.txt index 42b523ab..1af7e4e9 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -8,3 +8,4 @@ h5py hdf5plugin < 3.0.0; python_version < '3' hdf5plugin; python_version > '3.1' pynumpress +psims \ No newline at end of file From 43fcebe5028c6aed1ac2eb0070b96f3907515601 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Sun, 13 Jun 2021 16:01:28 -0400 Subject: [PATCH 27/27] ci --- pyteomics/proforma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyteomics/proforma.py b/pyteomics/proforma.py index 67d08fa2..e6c432b0 100644 --- a/pyteomics/proforma.py +++ b/pyteomics/proforma.py @@ -1686,7 +1686,7 @@ def parse(sequence): 'intervals': intervals, 'isotopes': isotopes, 'group_ids': sorted(current_tag.group_ids), - 'charge_state': charge_state + 'charge_state': charge_state, }