Skip to content

Commit

Permalink
Added support for braced modifications
Browse files (browse the repository at this point in the history)
  • Loading branch information
mobiusklein committed Jul 28, 2024
1 parent e840749 commit 917acbf
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 102 deletions.
25 changes: 22 additions & 3 deletions implementations/python/mzpaf/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

JSONDict = Dict[str, Union[List, Dict, int, float, str, bool, None]]

annotation_pattern = re.compile(r"""
annotation_pattern = re.compile(
r"""
^(?P<is_auxiliary>&)?
(?:(?P<analyte_reference>\d+)@)?
(?:(?:(?P<series>[axbycz]\.?)(?P<ordinal>\d+)(?:\{(?P<sequence_ordinal>.+)\})?)|
Expand All @@ -46,7 +47,7 @@
(?:(?:[A-Z][A-Za-z0-9]*)|
(?:\[
(?:
(?:[A-Za-z0-9:\.]+)
(?:[A-Za-z0-9:\.]+)(?:\[(?:[A-Za-z0-9\.:\-\ ]+)\])?
)
\])
)
Expand All @@ -56,7 +57,9 @@
(?:\^(?P<charge>[+-]?\d+))?
(?:/(?P<mass_error>[+-]?\d+(?:\.\d+)?)(?P<mass_error_unit>ppm)?)?
(?:\*(?P<confidence>\d*(?:\.\d+)?))?
""", re.X)
""",
re.X,
)

# At the time of first writing, this pattern could be translated into the equivalent
# ECMAScript-compliant regex:
Expand Down Expand Up @@ -943,6 +946,22 @@ def _coerce_confidence(self, data: Dict[str, str]) -> float:
return confidence

def parse_annotation(self, annotation_string: str, **kwargs) -> List[IonAnnotationBase]:
"""
Parse a string into one or more :class:`IonAnnotationBase` instances.
Parameters
----------
annotation_string : str
The string to be parsed
**kwargs
Passed to the :meth:`_dispatch` which in turn creates :class:`IonAnnotationBase`
instances
Returns
-------
list[:class:`IonAnnotationBase`] :
The annotations parsed
"""
if not annotation_string:
return []

Expand Down
2 changes: 1 addition & 1 deletion specification/grammars/annotation.lark
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ CHARACTER : LETTER | DIGIT | SYMBOL

CONTENT_IN_BRACES : LETTER | DIGIT | SYMBOL_WITHOUT_CLOSING_BRACE | " "

BRACE_ENCLOSED_CONTENT : "[" (CONTENT_IN_BRACES)+ "]"
BRACE_ENCLOSED_CONTENT : "[" (CONTENT_IN_BRACES)+ ("[" (CONTENT_IN_BRACES | " ")+ "]")? "]"

ANALYTE_REFERENCE : (CHARACTER)+

Expand Down
157 changes: 65 additions & 92 deletions specification/grammars/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,18 @@
import string


from railroad import (Diagram, Choice, Group, Optional, Terminal,
NonTerminal, Sequence, OneOrMore, ZeroOrMore, Stack)
from railroad import (
Diagram,
Choice,
Group,
Optional,
Terminal,
NonTerminal,
Sequence,
OneOrMore,
ZeroOrMore,
Stack,
)
import io

from pyteomics.mass import std_aa_comp
Expand All @@ -25,7 +35,7 @@
ATOM_COUNT = Sequence(
NonTerminal("UPPER_CASE_LETTER"),
ZeroOrMore(NonTerminal("LOWER_CASE_LETTER")),
OneOrMore(NonTerminal("DIGIT"))
OneOrMore(NonTerminal("DIGIT")),
)

NUMBER = Sequence(
Expand All @@ -35,9 +45,9 @@
Sequence(
"e",
OneOrMore(NonTerminal("DIGIT")),
Optional(Sequence(".", OneOrMore(NonTerminal("DIGIT"))))
Optional(Sequence(".", OneOrMore(NonTerminal("DIGIT")))),
)
)
),
)

ORDINAL = OneOrMore(NonTerminal("DIGIT"))
Expand All @@ -46,7 +56,7 @@
0,
NonTerminal("DIGIT"),
NonTerminal("UPPER_CASE_LETTER"),
NonTerminal("LOWER_CASE_LETTER")
NonTerminal("LOWER_CASE_LETTER"),
)

AMINO_ACID = Choice(0, *list(std_aa_comp)[:-2])
Expand All @@ -56,7 +66,14 @@
BraceEnclosedContent = Sequence(
Terminal("["),
OneOrMore(Choice(0, NonTerminal("CHARACTER"), NonTerminal("SYMBOL"))),
Terminal("]")
Optional(
Sequence(
Terminal("["),
OneOrMore(Choice(0, NonTerminal("CHARACTER"), NonTerminal("SYMBOL"), Terminal(" "))),
Terminal("]"),
)
),
Terminal("]"),
)

IsAuxiliary = Group(Optional(Terminal("&")), "Is Auxiliary")
Expand All @@ -72,21 +89,15 @@
Optional(
Sequence(
Terminal("{"),
Group(OneOrMore(NonTerminal("ANY")), 'ProForma 2.0 Sequence'),
Terminal("}")
Group(OneOrMore(NonTerminal("ANY")), "ProForma 2.0 Sequence"),
Terminal("}"),
)
)
),
),
"Peptide Ion",
)

ReporterIon = Group(
Sequence(
Terminal("r"),
BraceEnclosedContent
),
"Reporter Ion"
)
ReporterIon = Group(Sequence(Terminal("r"), BraceEnclosedContent), "Reporter Ion")

InternalIon = Group(
Sequence(
Expand All @@ -96,9 +107,9 @@
NonTerminal("ORDINAL"),
Sequence(
Terminal("{"),
Group(NonTerminal("ANY"), 'ProForma 2.0 Sequence'),
Terminal("}")
)
Group(NonTerminal("ANY"), "ProForma 2.0 Sequence"),
Terminal("}"),
),
),
"Internal Peptide Ion",
)
Expand All @@ -108,48 +119,36 @@
"Immonium Ion",
)

PrecursorIon = Group(
Terminal("p"),
"Precursor Ion"
)
PrecursorIon = Group(Terminal("p"), "Precursor Ion")


ChemicalFormula = OneOrMore(NonTerminal('ATOM_COUNT'))
ChemicalFormula = OneOrMore(NonTerminal("ATOM_COUNT"))


FormulaIon = Group(
Sequence(
Terminal("f"),
Terminal('{'),
ChemicalFormula,
Terminal('}')
),
"Formula Ion"
Sequence(Terminal("f"), Terminal("{"), ChemicalFormula, Terminal("}")),
"Formula Ion",
)

NamedCompound = Group(
Sequence(
Terminal("_"),
Terminal('{'),
Terminal("{"),
OneOrMore(NonTerminal("CHARACTER")),
Terminal('}'),
Terminal("}"),
),
"Named Compound"
"Named Compound",
)

UnknownIon = Group(
Sequence(Terminal("?"), Optional(OneOrMore(NonTerminal("DIGIT")))),
"Unknown Ion"
Sequence(Terminal("?"), Optional(OneOrMore(NonTerminal("DIGIT")))), "Unknown Ion"
)

SMILESIon = Group(
Sequence(
Terminal("s"),
Terminal('{'),
OneOrMore(Terminal("/[^}]/")),
Terminal('}')
Terminal("s"), Terminal("{"), OneOrMore(Terminal("/[^}]/")), Terminal("}")
),
"SMILES Ion"
"SMILES Ion",
)

IonType = Group(
Expand All @@ -165,15 +164,12 @@
SMILESIon,
UnknownIon,
),
"Ion Type"
"Ion Type",
)

NeutralLoss = Group(
Sequence(
NonTerminal('SIGN'),
Choice(0, ChemicalFormula, BraceEnclosedContent)
),
"Neutral Loss(es)"
Sequence(NonTerminal("SIGN"), Choice(0, ChemicalFormula, BraceEnclosedContent)),
"Neutral Loss(es)",
)

Isotope = Group(
Expand All @@ -182,64 +178,41 @@
Optional(NonTerminal("ORDINAL")),
Terminal("i"),
),
"Isotope"
"Isotope",
)

ChargeState = Group(
Sequence(
"^",
NonTerminal("ORDINAL")
),
"Charge State"
)
ChargeState = Group(Sequence("^", NonTerminal("ORDINAL")), "Charge State")

Adducts = Group(
Sequence(
'[',
'M',
"[",
"M",
OneOrMore(
Sequence(
NonTerminal('SIGN'),
NonTerminal("SIGN"),
ChemicalFormula,
)
),
']'
"]",
),
"Adducts"
"Adducts",
)

MassError = Group(
Sequence(
'/',
NonTerminal("NUMBER"),
Optional("ppm")
),
"Mass Error"
)
MassError = Group(Sequence("/", NonTerminal("NUMBER"), Optional("ppm")), "Mass Error")

ConfidenceEstimate = Group(
Sequence(
"*",
NonTerminal("NUMBER")
),
"Confidence Estimate"
)
ConfidenceEstimate = Group(Sequence("*", NonTerminal("NUMBER")), "Confidence Estimate")


Annotation = (
Stack(
IsAuxiliary,
Optional(
AnalyteIdentifier
),
IonType,
ZeroOrMore(NeutralLoss),
Optional(Isotope),
Optional(Adducts),
Optional(ChargeState),
Optional(MassError),
Optional(ConfidenceEstimate),
)
Annotation = Stack(
IsAuxiliary,
Optional(AnalyteIdentifier),
IonType,
ZeroOrMore(NeutralLoss),
Optional(Isotope),
Optional(Adducts),
Optional(ChargeState),
Optional(MassError),
Optional(ConfidenceEstimate),
)


Expand All @@ -255,12 +228,12 @@ def render_group_to_file(fh, name):
print("Writing", name)
tokens = globals()[name]
pathname: pathlib.Path = (image_dir / name).with_suffix(".svg")
with pathname.open('wt') as img_fh:
with pathname.open("wt") as img_fh:
img_fh.write(encode_svg(Diagram(tokens)))
fh.write(f"""## {name}\n<img src="{pathname}">\n\n""")


with open("grammar.md", 'wt') as fh:
with open("grammar.md", "wt") as fh:
fh.write("""# Peak Annotation Grammar\n\n""")
render_group_to_file(fh, "DIGIT")
render_group_to_file(fh, "LOWER_CASE_LETTER")
Expand Down
2 changes: 1 addition & 1 deletion specification/grammars/regex_ecma.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 7 additions & 4 deletions specification/grammars/regex_sre.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

annotation_pattern = re.compile(r"""
annotation_pattern = re.compile(
r"""
^(?P<is_auxiliary>&)?
(?:(?P<analyte_reference>\d+)@)?
(?:(?:(?P<series>[axbycz]\.?)(?P<ordinal>\d+)(?:\{(?P<sequence_ordinal>.+)\})?)|
Expand All @@ -12,7 +13,7 @@
(?P<reference_label>[^\]]+)
\])
))|
(?:f\{(?P<formula>[A-Za-z0-9]+)\})|
(?:f\{(?P<formula>[A-Za-z0-9\[\]]+)\})|
(?:_\{
(?P<named_compound>[^\{\}\s,/]+)
\})|
Expand All @@ -23,7 +24,7 @@
(?:(?:[A-Z][A-Za-z0-9]*)|
(?:\[
(?:
(?:[A-Za-z0-9:\.]+)
(?:[A-Za-z0-9:\.]+)(?:\[(?:[A-Za-z0-9\.:\-\ ]+)\])?
)
\])
)
Expand All @@ -33,4 +34,6 @@
(?:\^(?P<charge>[+-]?\d+))?
(?:/(?P<mass_error>[+-]?\d+(?:\.\d+)?)(?P<mass_error_unit>ppm)?)?
(?:\*(?P<confidence>\d*(?:\.\d+)?))?
""", re.X)
""",
re.X,
)
2 changes: 1 addition & 1 deletion specification/grammars/schema_images/Annotation.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 917acbf

Please sign in to comment.