Skip to content

Commit

Permalink
chg: improve efficiency of detecting expressions and last_saved
Browse files Browse the repository at this point in the history
- expression.py
  - remove is_single_token_expression and instead look for the token
    type of interest only. This requires making the lexer rules
    available outside of get_expression_lexer() and unfortunately
    compiling an extra time since the Scanner combines all regexes into
    one before compiling (so you can't give it pre-compiled regex).
  - change PYXFORM_REF regex to specifically look for optional
    `last-saved#` prefix instead of any ncname before a `#`. Not sure
    why it was like that but `last-saved` is quicker than ncname regex.
- pyxform_reference.py
  - add some sanity checks on input `value` to avoid more expensive
    parsing in the remaining function body
- survey.py
  - in _generate_last_saved_instance, combine the check performed on
    default/choice_filter/bind(items) into a func (in expression.py)
  - refactor check of bind items to more efficient method (per comment).
  • Loading branch information
lindsay-stevens committed Dec 18, 2024
1 parent fe32c6b commit 5979b9d
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 55 deletions.
60 changes: 29 additions & 31 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import re
from collections.abc import Iterable
from functools import lru_cache


def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"""
Get a expression lexer (scanner) for parsing.
"""
def get_lexer_rules():
# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
Expand All @@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
lexer_rules = {
return {
# https://www.w3.org/TR/xmlschema-2/#dateTime
"DATETIME": date_time_regex,
"DATE": date_regex,
Expand All @@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
Expand All @@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.
}


LEXER_RULES = get_lexer_rules()
RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])


def get_expression_lexer() -> re.Scanner:
def get_tokenizer(name):
def tokenizer(scan, value) -> ExpLexerToken | str:
if name_only:
return name
return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

return tokenizer

lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
# re.Scanner is undocumented but has been around since at least 2003
# https://mail.python.org/pipermail/python-dev/2003-April/035075.html
return re.Scanner(lexicon)
Expand All @@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:
self.end: int = end


# Scanner takes a few 100ms to compile so use these shared instances.
# Scanner takes a few 100ms to compile so use the shared instance.
_EXPRESSION_LEXER = get_expression_lexer()
_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)


@lru_cache(maxsize=128)
Expand All @@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
return tokens, remainder


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
"""
Does the expression contain single token of one of the provided token types?
"""
if not expression:
return False
tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
if 1 == len(tokens) and tokens[0] in token_types:
return True
else:
return False


def is_pyxform_reference(value: str) -> bool:
"""
Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
"""
if not value or len(value) <= 3: # Needs 3 characters for "${}", plus a name inside.
return False
return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))
# Needs 3 characters for "${}", plus a name inside.
return value and len(value) > 3 and bool(RE_ONLY_PYXFORM_REF.fullmatch(value))


def is_xml_tag(value: str) -> bool:
"""
Does the input string contain only a valid XML tag / element name?
"""
if not value:
return False
return is_single_token_expression(expression=value, token_types=("NAME",))
return value and bool(RE_ONLY_NCNAME.fullmatch(value))


def has_last_saved(value: str) -> bool:
"""
Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}
"""
# Needs 14 characters for "${last-saved#}", plus a name inside.
return (
value
and len(value) > 14
and "${last-saved#" in value
and RE_ANY_PYXFORM_REF.search(value)
)
34 changes: 15 additions & 19 deletions pyxform/survey.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
from pyxform.errors import PyXFormError, ValidationError
from pyxform.external_instance import ExternalInstance
from pyxform.instance import SurveyInstance
from pyxform.parsing import instance_expression
from pyxform.parsing.expression import has_last_saved
from pyxform.parsing.instance_expression import replace_with_output
from pyxform.question import MultipleChoiceQuestion, Option, Question, Tag
from pyxform.section import SECTION_EXTRA_FIELDS, Section
from pyxform.survey_element import SURVEY_ELEMENT_FIELDS, SurveyElement
from pyxform.utils import (
BRACKETED_TAG_REGEX,
LAST_SAVED_INSTANCE_NAME,
LAST_SAVED_REGEX,
DetachableElement,
escape_text_for_xml,
has_dynamic_label,
Expand Down Expand Up @@ -568,26 +568,22 @@ def _generate_from_file_instances(element: SurveyElement) -> InstanceInfo | None
return None

@staticmethod
def _generate_last_saved_instance(element) -> bool:
def _generate_last_saved_instance(element: SurveyElement) -> bool:
"""
True if a last-saved instance should be generated, false otherwise.
"""
if not hasattr(element, "bind") or element.bind is None:
if not isinstance(element, Question):
return False
for expression_type in constants.EXTERNAL_INSTANCES:
last_saved_expression = re.search(
LAST_SAVED_REGEX, str(element["bind"].get(expression_type))
)
if last_saved_expression:
return True
return bool(
hasattr(element, constants.CHOICE_FILTER)
and element.choice_filter is not None
and re.search(LAST_SAVED_REGEX, str(element.choice_filter))
or hasattr(element, "default")
and element.default is not None
and re.search(LAST_SAVED_REGEX, str(element.default))
)
if has_last_saved(element.default):
return True
if has_last_saved(element.choice_filter):
return True
if element.bind:
# Assuming average len(bind) < 10 and len(EXTERNAL_INSTANCES) = 5 and the
# current has_last_saved implementation, iterating bind keys is fastest.
for k, v in element.bind.items():
if k in constants.EXTERNAL_INSTANCES and has_last_saved(v):
return True

@staticmethod
def _get_last_saved_instance() -> InstanceInfo:
Expand Down Expand Up @@ -1320,7 +1316,7 @@ def _var_repl_output_function(matchobj):
# need to make sure we have reason to replace
# since at this point < is &lt,
# the net effect &lt gets translated again to &amp;lt;
xml_text = instance_expression.replace_with_output(original_xml, context, self)
xml_text = replace_with_output(original_xml, context, self)
if "{" in xml_text:
xml_text = re.sub(BRACKETED_TAG_REGEX, _var_repl_output_function, xml_text)
changed = xml_text != original_xml
Expand Down
1 change: 0 additions & 1 deletion pyxform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*")
LAST_SAVED_INSTANCE_NAME = "__last-saved"
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
PYXFORM_REFERENCE_REGEX = re.compile(r"\$\{(.*?)\}")
NODE_TYPE_TEXT = {Node.TEXT_NODE, Node.CDATA_SECTION_NODE}
XML_TEXT_SUBS = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}
Expand Down
11 changes: 7 additions & 4 deletions pyxform/validators/pyxform/pyxform_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@
def validate_pyxform_reference_syntax(
value: str, sheet_name: str, row_number: int, key: str
) -> None:
# Needs 3 characters for "${}" plus a name inside, but need to catch ${ for warning.
if not value or len(value) <= 2 or "${" not in value:
return
# Skip columns in potentially large sheets where references are not allowed.
if sheet_name == co.SURVEY:
if key in (co.TYPE, co.NAME):
elif sheet_name == co.SURVEY:
if key in {co.TYPE, co.NAME}:
return
elif sheet_name == co.CHOICES:
if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME):
if key in {co.LIST_NAME_S, co.LIST_NAME_U, co.NAME}:
return
elif sheet_name == co.ENTITIES:
if key == (co.LIST_NAME_S, co.LIST_NAME_U):
if key in {co.LIST_NAME_S, co.LIST_NAME_U}:
return

tokens, _ = parse_expression(value)
Expand Down

0 comments on commit 5979b9d

Please sign in to comment.