From 5979b9d4ec25df50950e4985c3d08ca36b3342bc Mon Sep 17 00:00:00 2001
From: lindsay stevens <lstevens@getodk.org>
Date: Thu, 19 Dec 2024 01:10:06 +1100
Subject: [PATCH] chg: improve efficiency of detecting expressions and
 last_saved

- expression.py
  - remove is_single_token_expression and instead look for the token
    type of interest only. This requires making the lexer rules
    available outside of get_expression_lexer() and unfortunately
    compiling an extra time since the Scanner combines all regexes into
    one before compiling (so you can't give it pre-compiled regex).
  - change PYXFORM_REF regex to specifically look for optional
    `last-saved#` prefix instead of any ncname before a `#`. Not sure
    why it was like that but `last-saved` is quicker than ncname regex.
- pyxform_reference.py
  - add some sanity checks on input `value` to avoid more expensive
    parsing in the remaining function body
- survey.py
  - in _generate_last_saved_instance, combine the check performed on
    default/choice_filter/bind(items) into a func (in expression.py)
  - refactor check of bind items to more efficient method (per comment).
---
 pyxform/parsing/expression.py                 | 60 +++++++++----------
 pyxform/survey.py                             | 34 +++++------
 pyxform/utils.py                              |  1 -
 .../validators/pyxform/pyxform_reference.py   | 11 ++--
 4 files changed, 51 insertions(+), 55 deletions(-)

diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py
index 2c80b74f..45401a70 100644
--- a/pyxform/parsing/expression.py
+++ b/pyxform/parsing/expression.py
@@ -1,12 +1,8 @@
 import re
-from collections.abc import Iterable
 from functools import lru_cache
 
 
-def get_expression_lexer(name_only: bool = False) -> re.Scanner:
-    """
-    Get a expression lexer (scanner) for parsing.
-    """
+def get_lexer_rules():
     # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
     # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
     # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
@@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
     date_time_regex = date_regex + "T" + time_regex
 
     # Rule order is significant - match priority runs top to bottom.
-    lexer_rules = {
+    return {
         # https://www.w3.org/TR/xmlschema-2/#dateTime
         "DATETIME": date_time_regex,
         "DATE": date_regex,
@@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
         "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
         "COMMA": r",",
         "WHITESPACE": r"\s+",
-        "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
+        "PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
         "FUNC_CALL": ncname_regex + r"\(",
         "XPATH_PRED_START": ncname_regex + r"\[",
         "XPATH_PRED_END": r"\]",
@@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
         "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
     }
 
+
+LEXER_RULES = get_lexer_rules()
+RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
+RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
+RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])
+
+
+def get_expression_lexer() -> re.Scanner:
     def get_tokenizer(name):
         def tokenizer(scan, value) -> ExpLexerToken | str:
-            if name_only:
-                return name
             return ExpLexerToken(name, value, scan.match.start(), scan.match.end())
 
         return tokenizer
 
-    lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
+    lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
     # re.Scanner is undocumented but has been around since at least 2003
     # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
     return re.Scanner(lexicon)
@@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:
         self.end: int = end
 
 
-# Scanner takes a few 100ms to compile so use these shared instances.
+# Scanner takes a few 100ms to compile so use the shared instance.
 _EXPRESSION_LEXER = get_expression_lexer()
-_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)
 
 
 @lru_cache(maxsize=128)
@@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
     return tokens, remainder
 
 
-def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
-    """
-    Does the expression contain single token of one of the provided token types?
-    """
-    if not expression:
-        return False
-    tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
-    if 1 == len(tokens) and tokens[0] in token_types:
-        return True
-    else:
-        return False
-
-
 def is_pyxform_reference(value: str) -> bool:
     """
     Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
     """
-    if not value or len(value) <= 3:  # Needs 3 characters for "${}", plus a name inside.
-        return False
-    return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))
+    # Needs 3 characters for "${}", plus a name inside.
+    return value and len(value) > 3 and bool(RE_ONLY_PYXFORM_REF.fullmatch(value))
 
 
 def is_xml_tag(value: str) -> bool:
     """
     Does the input string contain only a valid XML tag / element name?
     """
-    if not value:
-        return False
-    return is_single_token_expression(expression=value, token_types=("NAME",))
+    return value and bool(RE_ONLY_NCNAME.fullmatch(value))
+
+
+def has_last_saved(value: str) -> bool:
+    """
+    Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}
+    """
+    # Needs 14 characters for "${last-saved#}", plus a name inside.
+    return (
+        value
+        and len(value) > 14
+        and "${last-saved#" in value
+        and RE_ANY_PYXFORM_REF.search(value)
+    )
diff --git a/pyxform/survey.py b/pyxform/survey.py
index 1760d292..50ecafe9 100644
--- a/pyxform/survey.py
+++ b/pyxform/survey.py
@@ -18,14 +18,14 @@
 from pyxform.errors import PyXFormError, ValidationError
 from pyxform.external_instance import ExternalInstance
 from pyxform.instance import SurveyInstance
-from pyxform.parsing import instance_expression
+from pyxform.parsing.expression import has_last_saved
+from pyxform.parsing.instance_expression import replace_with_output
 from pyxform.question import MultipleChoiceQuestion, Option, Question, Tag
 from pyxform.section import SECTION_EXTRA_FIELDS, Section
 from pyxform.survey_element import SURVEY_ELEMENT_FIELDS, SurveyElement
 from pyxform.utils import (
     BRACKETED_TAG_REGEX,
     LAST_SAVED_INSTANCE_NAME,
-    LAST_SAVED_REGEX,
     DetachableElement,
     escape_text_for_xml,
     has_dynamic_label,
@@ -568,26 +568,22 @@ def _generate_from_file_instances(element: SurveyElement) -> InstanceInfo | None
         return None
 
     @staticmethod
-    def _generate_last_saved_instance(element) -> bool:
+    def _generate_last_saved_instance(element: SurveyElement) -> bool:
         """
         True if a last-saved instance should be generated, false otherwise.
         """
-        if not hasattr(element, "bind") or element.bind is None:
+        if not isinstance(element, Question):
             return False
-        for expression_type in constants.EXTERNAL_INSTANCES:
-            last_saved_expression = re.search(
-                LAST_SAVED_REGEX, str(element["bind"].get(expression_type))
-            )
-            if last_saved_expression:
-                return True
-        return bool(
-            hasattr(element, constants.CHOICE_FILTER)
-            and element.choice_filter is not None
-            and re.search(LAST_SAVED_REGEX, str(element.choice_filter))
-            or hasattr(element, "default")
-            and element.default is not None
-            and re.search(LAST_SAVED_REGEX, str(element.default))
-        )
+        if has_last_saved(element.default):
+            return True
+        if has_last_saved(element.choice_filter):
+            return True
+        if element.bind:
+            # Assuming average len(bind) < 10 and len(EXTERNAL_INSTANCES) = 5 and the
+            # current has_last_saved implementation, iterating bind keys is fastest.
+            for k, v in element.bind.items():
+                if k in constants.EXTERNAL_INSTANCES and has_last_saved(v):
+                    return True
 
     @staticmethod
     def _get_last_saved_instance() -> InstanceInfo:
@@ -1320,7 +1316,7 @@ def _var_repl_output_function(matchobj):
         # need to make sure we have reason to replace
         # since at this point < is &lt,
         # the net effect &lt gets translated again to &amp;lt;
-        xml_text = instance_expression.replace_with_output(original_xml, context, self)
+        xml_text = replace_with_output(original_xml, context, self)
         if "{" in xml_text:
             xml_text = re.sub(BRACKETED_TAG_REGEX, _var_repl_output_function, xml_text)
         changed = xml_text != original_xml
diff --git a/pyxform/utils.py b/pyxform/utils.py
index 1026c7a4..e42445a6 100644
--- a/pyxform/utils.py
+++ b/pyxform/utils.py
@@ -25,7 +25,6 @@
 INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*")
 LAST_SAVED_INSTANCE_NAME = "__last-saved"
 BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
-LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
 PYXFORM_REFERENCE_REGEX = re.compile(r"\$\{(.*?)\}")
 NODE_TYPE_TEXT = {Node.TEXT_NODE, Node.CDATA_SECTION_NODE}
 XML_TEXT_SUBS = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}
diff --git a/pyxform/validators/pyxform/pyxform_reference.py b/pyxform/validators/pyxform/pyxform_reference.py
index e55a408a..a1b02783 100644
--- a/pyxform/validators/pyxform/pyxform_reference.py
+++ b/pyxform/validators/pyxform/pyxform_reference.py
@@ -11,15 +11,18 @@
 def validate_pyxform_reference_syntax(
     value: str, sheet_name: str, row_number: int, key: str
 ) -> None:
+    # Needs 3 characters for "${}" plus a name inside, but need to catch ${ for warning.
+    if not value or len(value) <= 2 or "${" not in value:
+        return
     # Skip columns in potentially large sheets where references are not allowed.
-    if sheet_name == co.SURVEY:
-        if key in (co.TYPE, co.NAME):
+    elif sheet_name == co.SURVEY:
+        if key in {co.TYPE, co.NAME}:
             return
     elif sheet_name == co.CHOICES:
-        if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME):
+        if key in {co.LIST_NAME_S, co.LIST_NAME_U, co.NAME}:
             return
     elif sheet_name == co.ENTITIES:
-        if key == (co.LIST_NAME_S, co.LIST_NAME_U):
+        if key in {co.LIST_NAME_S, co.LIST_NAME_U}:
             return
 
     tokens, _ = parse_expression(value)