XLSForm · lognaturel · Sep 11, 2025 · Jul 31, 2025 · Aug 26, 2025
diff --git a/pyxform/builder.py b/pyxform/builder.py
@@ -118,15 +118,27 @@ def create_survey_element_from_dict(
 
     def _save_trigger(self, d: dict) -> None:
+        """
+        Register the element described by `d` against each of its trigger references.
+
+        :param d: Element definition. If it has a "trigger" key, a (name, calculate
+          expression or "") pair is appended to `setgeopoint_by_triggering_ref` for
+          "background-geopoint" types, otherwise to `setvalues_by_triggering_ref`.
+        """
         if "trigger" in d:
-            triggering_ref = d["trigger"].strip()
-            value = ""
-            if const.BIND in d and "calculate" in d[const.BIND]:
-                value = d[const.BIND]["calculate"]
-            question_ref = (d[const.NAME], value)
-            if d[const.TYPE] == "background-geopoint":
-                self.setgeopoint_by_triggering_ref[triggering_ref].append(question_ref)
-            else:
-                self.setvalues_by_triggering_ref[triggering_ref].append(question_ref)
+            triggers = d["trigger"]
+            # A lone string (the shape accepted before multiple triggers were allowed)
+            # would otherwise be iterated one character at a time.
+            if isinstance(triggers, str):
+                triggers = (triggers.strip(),)
+            for trigger in triggers:
+                value = ""
+                if const.BIND in d and "calculate" in d[const.BIND]:
+                    value = d[const.BIND]["calculate"]
+                question_ref = (d[const.NAME], value)
+                if d[const.TYPE] == "background-geopoint":
+                    self.setgeopoint_by_triggering_ref[trigger].append(question_ref)
+                else:
+                    self.setvalues_by_triggering_ref[trigger].append(question_ref)
 
     @staticmethod
     def _create_question_from_dict(

diff --git a/pyxform/errors.py b/pyxform/errors.py
@@ -2,10 +2,93 @@
 Common base classes for pyxform exceptions.
 """
 
+from enum import Enum
+from string import Formatter
+from typing import Any
+
+
+class _ErrorFormatter(Formatter):
+    """A Formatter that falls back to a default for absent named fields."""
+
+    def __init__(self, default_value: str = "unknown"):
+        self.default_value: str = default_value
+
+    def get_value(self, key, args, kwargs):
+        # Positional (non-string) keys keep the standard Formatter lookup.
+        if not isinstance(key, str):
+            return super().get_value(key, args, kwargs)
+        return kwargs.get(key, self.default_value)
+
+
+_ERROR_FORMATTER = _ErrorFormatter()
+
+
+class _Detail:
+    """Display name and message template attached to an ErrorCode member."""
+
+    __slots__ = ("msg", "name")
+
+    def __init__(self, name: str, msg: str) -> None:
+        self.name: str = name
+        self.msg: str = msg
+
+    def format(self, **kwargs):  # Fill the msg template using the shared formatter.
+        return _ERROR_FORMATTER.format(self.msg, **kwargs)
+
+
+class ErrorCode(Enum):  # Each member's value is a _Detail (display name + msg template).
+    PYREF_001: _Detail = _Detail(
+        name="PyXForm Reference Parsing Failed",
+        msg=(  # Doubled braces below are format escapes for literal '{' and '}'.
+            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
+            "Reference variables must start with '${{', then a question name, and end with '}}'."
+        ),
+    )
+    PYREF_002: _Detail = _Detail(
+        name="PyXForm Reference Parsing Limit Reached",
+        msg=(
+            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
+            "Reference variable lists must have a comma between each variable."
+        ),
+    )
+    PYREF_003: _Detail = _Detail(
+        name="PyXForm Reference Question Not Found",
+        msg=(
+            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
+            "Reference variables must refer to a question name. Could not find '{q}'."
+        ),
+    )
+
 
 class PyXFormError(Exception):
     """Common base class for pyxform exceptions."""
 
+    def __init__(
+        self, *args, code: ErrorCode | None = None, context: dict[str, Any] | None = None
+    ) -> None:
+        """
+        :param args: Args for the base exception, such as a pre-formatted error message.
+        :param code: If provided, its template (or name) becomes the error message.
+        :param context: If provided, values used to fill the code's message template.
+        """
+        super().__init__(*args)
+        self.code: ErrorCode | None = code
+        self.context: dict = context if context else {}
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        """Formatted code message (or code name), else the first arg, else default repr."""
+        if self.code:
+            if self.context:
+                return self.code.value.format(**self.context)
+            return self.code.value.name
+        elif self.args and self.args[0]:  # args is empty when raised with no message.
+            return str(self.args[0])  # __str__/__repr__ must return a str.
+        else:
+            return super().__repr__()
+
 
 class ValidationError(PyXFormError):
     """Common base class for pyxform validation exceptions."""

diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py
@@ -1,72 +1,81 @@
 import re
 from functools import lru_cache
-
-
-def get_lexer_rules():
-    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
-    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
-    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
-    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
-    namestartchar = (
-        r"([A-Z]|_|[a-z]|\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
-        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
-        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
-        + r"|[\U00010000-\U000EFFFF])"
-    )
-    # additional characters allowed in NCNames after the first character
-    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
-    ncname_regex = (
-        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
-    )
-    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"
-
-    date_regex = r"-?\d{4}-\d{2}-\d{2}"
-    time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
-    date_time_regex = date_regex + "T" + time_regex
-
-    # Rule order is significant - match priority runs top to bottom.
-    return {
-        # https://www.w3.org/TR/xmlschema-2/#dateTime
-        "DATETIME": date_time_regex,
-        "DATE": date_regex,
-        "TIME": time_regex,
-        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
-        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
-        "OPS_MATH": r"[\*\+\-]| mod | div ",
-        "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
-        "OPS_BOOL": r" and | or ",
-        "OPS_UNION": r"\|",
-        "OPEN_PAREN": r"\(",
-        "CLOSE_PAREN": r"\)",
-        "BRACKET": r"\[\]\{\}",
-        "PARENT_REF": r"\.\.",
-        "SELF_REF": r"\.",
-        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
-        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
-        "COMMA": r",",
-        "WHITESPACE": r"\s+",
-        "PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
-        "FUNC_CALL": ncname_regex + r"\(",
-        "XPATH_PRED_START": ncname_regex + r"\[",
-        "XPATH_PRED_END": r"\]",
-        "URI_SCHEME": ncname_regex + r"://",
-        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
-        "PYXFORM_REF_START": r"\$\{",
-        "PYXFORM_REF_END": r"\}",
-        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
-    }
-
-
-LEXER_RULES = get_lexer_rules()
-RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
-RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
-RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])
+from typing import Any
+
+# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
+# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
+# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+# and https://www.w3.org/TR/REC-xml-names/#NT-NCName
+namestartchar = (
+    r"(?:[A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
+    + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
+    + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
+    + r"|[\U00010000-\U000EFFFF])"
+)
+# additional characters allowed in NCNames after the first character
+namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
+ncname_regex = rf"{namestartchar}(?:{namestartchar}|{namechar_extra})*"
+ncname_regex_named = rf"(?P<ncname>{ncname_regex})"
+# NCName with an optional ":NCName" suffix (i.e. prefix:local)
+ncname_regex_ns = rf"{ncname_regex}(?:\:{ncname_regex})?"
+ncname_regex_ns_named = rf"(?P<ncname_ns>{ncname_regex_ns})"
+
+date_regex = r"-?\d{4}-\d{2}-\d{2}"
+time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
+date_time_regex = date_regex + "T" + time_regex  # e.g. 2025-01-31T12:30:45.5Z
+
+# pyxform_ref_outer captures candidate refs; an unterminated "${" matches with no group, to exclude it.
+pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
+pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ncname_regex_named}"
+pyxform_ref_inner_last_saved_required = (
+    rf"(?P<last_saved>last-saved#){ncname_regex_named}"
+)
+pyxform_ref = rf"(?P<pyxform_ref>\$\{{{pyxform_ref_inner}\}})"
+
+# Rule order is significant - match priority runs top to bottom.
+LEXER_RULES = {
+    # https://www.w3.org/TR/xmlschema-2/#dateTime
+    "DATETIME": date_time_regex,
+    "DATE": date_regex,
+    "TIME": time_regex,
+    "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
+    # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
+    "OPS_MATH": r"[\*\+\-]| mod | div ",
+    "OPS_COMP": r"\!\=|\<=|>=|\=|\<|\>",  # Two-char operators first, else "<=" splits.
+    "OPS_BOOL": r" and | or ",
+    "OPS_UNION": r"\|",
+    "OPEN_PAREN": r"\(",
+    "CLOSE_PAREN": r"\)",
+    "BRACKET": r"\[\]\{\}",  # NOTE(review): matches only the literal "[]{}" - confirm.
+    "PARENT_REF": r"\.\.",
+    "SELF_REF": r"\.",
+    "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
+    "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
+    "COMMA": r",",
+    "WHITESPACE": r"\s+",
+    "PYXFORM_REF": pyxform_ref,
+    "FUNC_CALL": ncname_regex_ns_named + r"\(",
+    "XPATH_PRED_START": ncname_regex_ns_named + r"\[",
+    "XPATH_PRED_END": r"\]",
+    "URI_SCHEME": ncname_regex_named + r"://",
+    "NAME": ncname_regex_named,  # Must be after rules containing ncname_regex.
+    "PYXFORM_REF_START": r"\$\{",
+    "PYXFORM_REF_END": r"\}",
+    "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
+}
+
+
+RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)  # NCName, optionally "a:b"
+RE_PYXFORM_REF = re.compile(pyxform_ref)  # "${name}" or "${last-saved#name}"
+RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)  # "${...}" or an unterminated "${"
+RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)  # the part between the braces
 
 
 def get_expression_lexer() -> re.Scanner:
     def get_tokenizer(name):
         def tokenizer(scan, value) -> ExpLexerToken | str:
-            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())
+            match = scan.match
+            return ExpLexerToken(name, value, match.start(), match.end())
 
         return tokenizer
 
@@ -104,29 +113,22 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
     return tokens, remainder
 
 
-def is_pyxform_reference(value: str) -> bool:
-    """
-    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
-    """
-    # Needs 3 characters for "${}", plus a name inside.
-    return value and len(value) > 3 and bool(RE_ONLY_PYXFORM_REF.match(value))
-
-
 def is_xml_tag(value: str) -> bool:
     """
     Does the input string contain only a valid XML tag / element name?
     """
-    return value and bool(RE_ONLY_NCNAME.match(value))
+    return bool(value and RE_NCNAME_NAMESPACED.fullmatch(value))  # always a real bool
 
 
-def has_last_saved(value: str) -> bool:
+def maybe_strip(value: Any) -> Any:
     """
-    Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}
+    Return `value` stripped only if it is a string that starts or ends with whitespace.
+
+    Anything else (non-strings, empty strings, strings without whitespace at either
+    end) is returned unchanged without calling strip(). The end-character check is
+    intended to avoid needless strip() calls, since XLSForms tend to contain many
+    unique string values.
     """
-    # Needs 14 characters for "${last-saved#}", plus a name inside.
-    return (
-        value
-        and len(value) > 14
-        and "${last-saved#" in value
-        and RE_ANY_PYXFORM_REF.search(value)
-    )
+    if not (isinstance(value, str) and value):
+        return value
+    return value.strip() if value[0].isspace() or value[-1].isspace() else value
diff --git a/pyxform/parsing/instance_expression.py b/pyxform/parsing/instance_expression.py
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
-from pyxform.parsing.expression import parse_expression
-from pyxform.utils import BRACKETED_TAG_REGEX, node
+from pyxform.parsing.expression import RE_PYXFORM_REF, parse_expression
+from pyxform.utils import node
 
 if TYPE_CHECKING:
     from pyxform.survey import Survey
@@ -99,7 +99,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
     :return: The possibly modified string.
     """
     # 9 = len("instance(")
-    if 9 >= len(xml_text):
+    if len(xml_text) <= 9 or "instance(" not in xml_text:
         return xml_text
     boundaries = find_boundaries(xml_text=xml_text)
     if boundaries:
@@ -108,7 +108,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
             old_str = xml_text[start:end]
             # Pass the new string through the pyxform reference replacer.
             # noinspection PyProtectedMember
-            new_str = BRACKETED_TAG_REGEX.sub(
+            new_str = RE_PYXFORM_REF.sub(
                 lambda m: survey._var_repl_function(m, context),
                 old_str,
             )