From 5979b9d4ec25df50950e4985c3d08ca36b3342bc Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Thu, 19 Dec 2024 01:10:06 +1100 Subject: [PATCH] chg: improve efficiency of detecting expressions and last_saved - expression.py - remove is_single_token_expression and instead look for the token type of interest only. This requires making the lexer rules available outside of get_expression_lexer() and unfortunately compiling an extra time since the Scanner combines all regexes into one before compiling (so you can't give it pre-compiled regex). - change PYXFORM_REF regex to specifically look for optional `last-saved#` prefix instead of any ncname before a `#`. Not sure why it was like that but `last-saved` is quicker than ncname regex. - pyxform_reference.py - add some sanity checks on input `value` to avoid more expensive parsing in the remaining function body - survey.py - in _generate_last_saved_instance, combine the check performed on default/choice_filter/bind(items) into a func (in expression.py) - refactor check of bind items to more efficient method (per comment). --- pyxform/parsing/expression.py | 60 +++++++++---------- pyxform/survey.py | 34 +++++------ pyxform/utils.py | 1 - .../validators/pyxform/pyxform_reference.py | 11 ++-- 4 files changed, 51 insertions(+), 55 deletions(-) diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py index 2c80b74f..45401a70 100644 --- a/pyxform/parsing/expression.py +++ b/pyxform/parsing/expression.py @@ -1,12 +1,8 @@ import re -from collections.abc import Iterable from functools import lru_cache -def get_expression_lexer(name_only: bool = False) -> re.Scanner: - """ - Get a expression lexer (scanner) for parsing. - """ +def get_lexer_rules(): # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py # (C) 2010,2011 Emory University Libraries [Apache v2.0 License] # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar @@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner: date_time_regex = date_regex + "T" + time_regex # Rule order is significant - match priority runs top to bottom. - lexer_rules = { + return { # https://www.w3.org/TR/xmlschema-2/#dateTime "DATETIME": date_time_regex, "DATE": date_regex, @@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner: "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""", "COMMA": r",", "WHITESPACE": r"\s+", - "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}", + "PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}", "FUNC_CALL": ncname_regex + r"\(", "XPATH_PRED_START": ncname_regex + r"\[", "XPATH_PRED_END": r"\]", @@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner: "OTHER": r".+?", # Catch any other character so that parsing doesn't stop. } + +LEXER_RULES = get_lexer_rules() +RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""") +RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""") +RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"]) + + +def get_expression_lexer() -> re.Scanner: def get_tokenizer(name): def tokenizer(scan, value) -> ExpLexerToken | str: - if name_only: - return name return ExpLexerToken(name, value, scan.match.start(), scan.match.end()) return tokenizer - lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()] + lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()] # re.Scanner is undocumented but has been around since at least 2003 # https://mail.python.org/pipermail/python-dev/2003-April/035075.html return re.Scanner(lexicon) @@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None: self.end: int = end -# Scanner takes a few 100ms to compile so use these shared instances. +# Scanner takes a few 100ms to compile so use the shared instance. _EXPRESSION_LEXER = get_expression_lexer() -_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True) @lru_cache(maxsize=128) @@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]: return tokens, remainder -def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool: - """ - Does the expression contain single token of one of the provided token types? - """ - if not expression: - return False - tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip()) - if 1 == len(tokens) and tokens[0] in token_types: - return True - else: - return False - - def is_pyxform_reference(value: str) -> bool: """ Does the input string contain only a valid Pyxform reference? e.g. ${my_question} """ - if not value or len(value) <= 3: # Needs 3 characters for "${}", plus a name inside. - return False - return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",)) + # Needs 3 characters for "${}", plus a name inside. + return value and len(value) > 3 and bool(RE_ONLY_PYXFORM_REF.fullmatch(value)) def is_xml_tag(value: str) -> bool: """ Does the input string contain only a valid XML tag / element name? """ - if not value: - return False - return is_single_token_expression(expression=value, token_types=("NAME",)) + return value and bool(RE_ONLY_NCNAME.fullmatch(value)) + + +def has_last_saved(value: str) -> bool: + """ + Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question} + """ + # Needs 14 characters for "${last-saved#}", plus a name inside. + return ( + value + and len(value) > 14 + and "${last-saved#" in value + and RE_ANY_PYXFORM_REF.search(value) + ) diff --git a/pyxform/survey.py b/pyxform/survey.py index 1760d292..50ecafe9 100644 --- a/pyxform/survey.py +++ b/pyxform/survey.py @@ -18,14 +18,14 @@ from pyxform.errors import PyXFormError, ValidationError from pyxform.external_instance import ExternalInstance from pyxform.instance import SurveyInstance -from pyxform.parsing import instance_expression +from pyxform.parsing.expression import has_last_saved +from pyxform.parsing.instance_expression import replace_with_output from pyxform.question import MultipleChoiceQuestion, Option, Question, Tag from pyxform.section import SECTION_EXTRA_FIELDS, Section from pyxform.survey_element import SURVEY_ELEMENT_FIELDS, SurveyElement from pyxform.utils import ( BRACKETED_TAG_REGEX, LAST_SAVED_INSTANCE_NAME, - LAST_SAVED_REGEX, DetachableElement, escape_text_for_xml, has_dynamic_label, @@ -568,26 +568,22 @@ def _generate_from_file_instances(element: SurveyElement) -> InstanceInfo | None return None @staticmethod - def _generate_last_saved_instance(element) -> bool: + def _generate_last_saved_instance(element: SurveyElement) -> bool: """ True if a last-saved instance should be generated, false otherwise. """ - if not hasattr(element, "bind") or element.bind is None: + if not isinstance(element, Question): return False - for expression_type in constants.EXTERNAL_INSTANCES: - last_saved_expression = re.search( - LAST_SAVED_REGEX, str(element["bind"].get(expression_type)) - ) - if last_saved_expression: - return True - return bool( - hasattr(element, constants.CHOICE_FILTER) - and element.choice_filter is not None - and re.search(LAST_SAVED_REGEX, str(element.choice_filter)) - or hasattr(element, "default") - and element.default is not None - and re.search(LAST_SAVED_REGEX, str(element.default)) - ) + if has_last_saved(element.default): + return True + if has_last_saved(element.choice_filter): + return True + if element.bind: + # Assuming average len(bind) < 10 and len(EXTERNAL_INSTANCES) = 5 and the + # current has_last_saved implementation, iterating bind keys is fastest. + for k, v in element.bind.items(): + if k in constants.EXTERNAL_INSTANCES and has_last_saved(v): + return True @staticmethod def _get_last_saved_instance() -> InstanceInfo: @@ -1320,7 +1316,7 @@ def _var_repl_output_function(matchobj): # need to make sure we have reason to replace # since at this point < is <, # the net effect < gets translated again to &lt; - xml_text = instance_expression.replace_with_output(original_xml, context, self) + xml_text = replace_with_output(original_xml, context, self) if "{" in xml_text: xml_text = re.sub(BRACKETED_TAG_REGEX, _var_repl_output_function, xml_text) changed = xml_text != original_xml diff --git a/pyxform/utils.py b/pyxform/utils.py index 1026c7a4..e42445a6 100644 --- a/pyxform/utils.py +++ b/pyxform/utils.py @@ -25,7 +25,6 @@ INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*") LAST_SAVED_INSTANCE_NAME = "__last-saved" BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}") -LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}") PYXFORM_REFERENCE_REGEX = re.compile(r"\$\{(.*?)\}") NODE_TYPE_TEXT = {Node.TEXT_NODE, Node.CDATA_SECTION_NODE} XML_TEXT_SUBS = {"&": "&", "<": "<", ">": ">"} diff --git a/pyxform/validators/pyxform/pyxform_reference.py b/pyxform/validators/pyxform/pyxform_reference.py index e55a408a..a1b02783 100644 --- a/pyxform/validators/pyxform/pyxform_reference.py +++ b/pyxform/validators/pyxform/pyxform_reference.py @@ -11,15 +11,18 @@ def validate_pyxform_reference_syntax( value: str, sheet_name: str, row_number: int, key: str ) -> None: + # Needs 3 characters for "${}" plus a name inside, but need to catch ${ for warning. + if not value or len(value) <= 2 or "${" not in value: + return # Skip columns in potentially large sheets where references are not allowed. - if sheet_name == co.SURVEY: - if key in (co.TYPE, co.NAME): + elif sheet_name == co.SURVEY: + if key in {co.TYPE, co.NAME}: return elif sheet_name == co.CHOICES: - if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME): + if key in {co.LIST_NAME_S, co.LIST_NAME_U, co.NAME}: return elif sheet_name == co.ENTITIES: - if key == (co.LIST_NAME_S, co.LIST_NAME_U): + if key in {co.LIST_NAME_S, co.LIST_NAME_U}: return tokens, _ = parse_expression(value)