Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions pyxform/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,15 +118,15 @@ def create_survey_element_from_dict(

def _save_trigger(self, d: dict) -> None:
    """
    Register this question against every trigger listed in its definition.

    For each entry under the "trigger" key, a ``(question name, calculate
    expression)`` pair is appended to ``self.setgeopoint_by_triggering_ref``
    when the question type is "background-geopoint", otherwise to
    ``self.setvalues_by_triggering_ref``. The calculate expression is an empty
    string when the bind has none.

    :param d: The question definition. Nothing is recorded when it has no
      "trigger" entry or the entry is empty.
    """
    # NOTE(review): assumes the "trigger" entry is already a collection of
    # references; a plain string would be iterated per character - confirm
    # against the code that builds this dict.
    triggers = d.get("trigger")
    if not triggers:
        return

    value = ""
    if const.BIND in d and "calculate" in d[const.BIND]:
        value = d[const.BIND]["calculate"]

    # Neither the pair nor the destination mapping depends on the individual
    # trigger, so both are resolved once instead of on every iteration.
    question_ref = (d[const.NAME], value)
    if d[const.TYPE] == "background-geopoint":
        refs_by_trigger = self.setgeopoint_by_triggering_ref
    else:
        refs_by_trigger = self.setvalues_by_triggering_ref

    for trigger in triggers:
        refs_by_trigger[trigger].append(question_ref)

@staticmethod
def _create_question_from_dict(
Expand Down
83 changes: 83 additions & 0 deletions pyxform/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,93 @@
Common base classes for pyxform exceptions.
"""

from enum import Enum
from string import Formatter
from typing import Any


class _ErrorFormatter(Formatter):
    """A ``string.Formatter`` that substitutes a default for absent named fields."""

    def __init__(self, default_value: str = "unknown"):
        # Text rendered in place of a named field that has no matching keyword.
        self.default_value: str = default_value

    def get_value(self, key, args, kwargs):
        # Non-string (positional) keys keep the stock Formatter lookup.
        if not isinstance(key, str):
            return super().get_value(key, args, kwargs)
        # Named keys never fail: fall back to the configured default.
        return kwargs.get(key, self.default_value)


# Module-wide formatter instance, created with the default placeholder text.
_ERROR_FORMATTER = _ErrorFormatter()


class _Detail:
    """Pairs a short title with a message template for one ErrorCode member."""

    __slots__ = ("msg", "name")

    def __init__(self, name: str, msg: str) -> None:
        # msg is a format-style template; name is the short title.
        self.msg: str = msg
        self.name: str = name

    def format(self, **kwargs):
        """Render the message template with the supplied keyword values."""
        template = self.msg
        return _ERROR_FORMATTER.format(template, **kwargs)


class ErrorCode(Enum):
    """
    Structured error codes.

    Each member's value is a ``_Detail`` holding a short title (``name``) and a
    message template (``msg``) whose placeholders are filled from the error context.
    """

    # Members are deliberately left un-annotated: an explicit annotation on an enum
    # member misleads type checkers, which infer the member type themselves.
    PYREF_001 = _Detail(
        name="PyXForm Reference Parsing Failed",
        msg=(
            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
            "Reference variables must start with '${{', then a question name, and end with '}}'."
        ),
    )
    PYREF_002 = _Detail(
        name="PyXForm Reference Parsing Limit Reached",
        msg=(
            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
            "Reference variable lists must have a comma between each variable."
        ),
    )
    PYREF_003 = _Detail(
        name="PyXForm Reference Question Not Found",
        msg=(
            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
            "Reference variables must refer to a question name. Could not find '{q}'."
        ),
    )


class PyXFormError(Exception):
    """Common base class for pyxform exceptions."""

    def __init__(
        self, *args, code: ErrorCode | None = None, context: dict[str, Any] | None = None
    ) -> None:
        """
        :param args: Args for the base exception, such as a pre-formatted error message.
        :param code: If provided, used for an error message template.
        :param context: If provided, used to format the error message template.
        """
        super().__init__(*args)
        self.code: ErrorCode | None = code
        self.context: dict = context if context else {}

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """
        Return the error message.

        With a code: the code's template formatted with the context, or the code's
        title when there is no context. Without a code: the first positional arg.
        Falls back to the default exception repr when there is nothing to show.
        """
        if self.code:
            detail = self.code.value
            if self.context:
                return detail.format(**self.context)
            return detail.name
        # args is empty when raised bare, so check before indexing. The first arg
        # may not be a string, and __str__/__repr__ must return one.
        if self.args and self.args[0]:
            return str(self.args[0])
        return super().__repr__()


class ValidationError(PyXFormError):
"""Common base class for pyxform validation exceptions."""
Expand Down
162 changes: 82 additions & 80 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,81 @@
import re
from functools import lru_cache


def get_lexer_rules():
    """
    Build the expression lexer rules.

    :return: A dict of rule name to regex pattern string. Insertion order is the
      match priority, so rules listed first win over later ones.
    """
    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
    # The \xc0-\xd6 range must be a bracketed character class; without the opening
    # bracket it only matches the literal text "\xc0-\xd6]".
    namestartchar = (
        r"([A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
        + r"|[\U00010000-\U000EFFFF])"
    )
    # additional characters allowed in NCNames after the first character
    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
    ncname_regex = (
        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
    )
    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

    date_regex = r"-?\d{4}-\d{2}-\d{2}"
    # Fractional seconds are digits after the dot (was whitespace, "\s+").
    time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
    date_time_regex = date_regex + "T" + time_regex

    # Rule order is significant - match priority runs top to bottom.
    return {
        # https://www.w3.org/TR/xmlschema-2/#dateTime
        "DATETIME": date_time_regex,
        "DATE": date_regex,
        "TIME": time_regex,
        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
        "OPS_MATH": r"[\*\+\-]| mod | div ",
        # Two-character operators first: alternation takes the first branch that
        # matches, so "<" listed before "<=" would split "<=" into two tokens.
        "OPS_COMP": r"\<=|\>=|\!\=|\=|\<|\>",
        "OPS_BOOL": r" and | or ",
        "OPS_UNION": r"\|",
        "OPEN_PAREN": r"\(",
        "CLOSE_PAREN": r"\)",
        # NOTE(review): this matches only the literal sequence "[]{}"; presumably a
        # character class was intended, but later rules rely on seeing single
        # brackets - confirm before changing.
        "BRACKET": r"\[\]\{\}",
        "PARENT_REF": r"\.\.",
        "SELF_REF": r"\.",
        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
        "COMMA": r",",
        "WHITESPACE": r"\s+",
        "PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
        "FUNC_CALL": ncname_regex + r"\(",
        "XPATH_PRED_START": ncname_regex + r"\[",
        "XPATH_PRED_END": r"\]",
        "URI_SCHEME": ncname_regex + r"://",
        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
        "PYXFORM_REF_START": r"\$\{",
        "PYXFORM_REF_END": r"\}",
        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
    }


# Build the rule table once at import time; the patterns below are derived from it.
LEXER_RULES = get_lexer_rules()
# NAME rule anchored with ^ and $.
RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
# PYXFORM_REF rule anchored with ^ and $.
RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
# PYXFORM_REF rule without anchors, usable for searching inside a longer string.
RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])
from typing import Any

# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
# and https://www.w3.org/TR/REC-xml-names/#NT-NCName
# The \xc0-\xd6 range must be a bracketed character class; without the opening
# bracket it only matches the literal text "\xc0-\xd6]".
namestartchar = (
    r"(?:[A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
    + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
    + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
    + r"|[\U00010000-\U000EFFFF])"
)
# additional characters allowed in NCNames after the first character
namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
ncname_regex = rf"{namestartchar}(?:{namestartchar}|{namechar_extra})*"
ncname_regex_named = rf"(?P<ncname>{ncname_regex})"
# namespaced ncname
ncname_regex_ns = rf"{ncname_regex}(?:\:{ncname_regex})?"
ncname_regex_ns_named = rf"(?P<ncname_ns>{ncname_regex_ns})"

date_regex = r"-?\d{4}-\d{2}-\d{2}"
# Fractional seconds are digits after the dot (was whitespace, "\s+").
time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
date_time_regex = date_regex + "T" + time_regex

# pyxform_ref_outer picks up possible refs, and matches unterminated refs to exclude them.
pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ncname_regex_named}"
pyxform_ref_inner_last_saved_required = (
    rf"(?P<last_saved>last-saved#){ncname_regex_named}"
)
pyxform_ref = rf"(?P<pyxform_ref>\$\{{{pyxform_ref_inner}\}})"

# Rule order is significant - match priority runs top to bottom.
LEXER_RULES = {
    # https://www.w3.org/TR/xmlschema-2/#dateTime
    "DATETIME": date_time_regex,
    "DATE": date_regex,
    "TIME": time_regex,
    "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
    # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
    "OPS_MATH": r"[\*\+\-]| mod | div ",
    # Two-character operators first: alternation takes the first branch that
    # matches, so "<" listed before "<=" would split "<=" into two tokens.
    "OPS_COMP": r"\<=|\>=|\!\=|\=|\<|\>",
    "OPS_BOOL": r" and | or ",
    "OPS_UNION": r"\|",
    "OPEN_PAREN": r"\(",
    "CLOSE_PAREN": r"\)",
    # NOTE(review): this matches only the literal sequence "[]{}"; presumably a
    # character class was intended, but later rules rely on seeing single
    # brackets - confirm before changing.
    "BRACKET": r"\[\]\{\}",
    "PARENT_REF": r"\.\.",
    "SELF_REF": r"\.",
    "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
    "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
    "COMMA": r",",
    "WHITESPACE": r"\s+",
    "PYXFORM_REF": pyxform_ref,
    "FUNC_CALL": ncname_regex_ns_named + r"\(",
    "XPATH_PRED_START": ncname_regex_ns_named + r"\[",
    "XPATH_PRED_END": r"\]",
    "URI_SCHEME": ncname_regex_named + r"://",
    "NAME": ncname_regex_named,  # Must be after rules containing ncname_regex.
    "PYXFORM_REF_START": r"\$\{",
    "PYXFORM_REF_END": r"\}",
    "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
}


# Pre-compiled patterns shared by the parsing helpers.
RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)
RE_PYXFORM_REF = re.compile(pyxform_ref)
RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)
RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)


def get_expression_lexer() -> re.Scanner:
def get_tokenizer(name):
def tokenizer(scan, value) -> ExpLexerToken | str:
return ExpLexerToken(name, value, scan.match.start(), scan.match.end())
match = scan.match
return ExpLexerToken(name, value, match.start(), match.end())

return tokenizer

Expand Down Expand Up @@ -104,29 +113,22 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
return tokens, remainder


def is_pyxform_reference(value: str) -> bool:
    """
    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}

    :param value: The string to check. Empty or None input yields False.
    :return: Always a bool (a falsy input is no longer returned as-is).
    """
    # Needs 3 characters for "${}", plus a name inside.
    return bool(value) and len(value) > 3 and bool(RE_ONLY_PYXFORM_REF.match(value))


def is_xml_tag(value: str) -> bool:
    """
    Does the input string contain only a valid XML tag / element name?

    :param value: The string to check. Empty or None input yields False.
    :return: Always a bool (a falsy input is no longer returned as-is).
    """
    # A single return: the whole value must match the (optionally namespaced) name.
    return bool(value) and bool(RE_NCNAME_NAMESPACED.fullmatch(value))


def has_last_saved(value: str) -> bool:
    """
    Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}

    :param value: The string to check. Empty or None input yields False.
    :return: Always a bool (neither the falsy input nor a match object is returned).
    """
    # Needs 14 characters for "${last-saved#}", plus a name inside.
    return (
        bool(value)
        and len(value) > 14
        and "${last-saved#" in value
        and bool(RE_ANY_PYXFORM_REF.search(value))
    )


def maybe_strip(value: Any) -> Any:
    """
    If the value is a string and looks like it has whitespace at either end, strip it.

    If a string was "interned" (cached) by Python, string.strip() should generally return
    the existing string if no leading/trailing whitespace was found. But strings may or
    may not be interned by Python, and there may be a large cache for many unique values
    (which is likely for XLSForms), so this function tries to avoid calling strip().

    :param value: Any object; non-strings and empty strings are returned unchanged.
    :return: The stripped string, or the original value when there is nothing to strip.
    """
    if isinstance(value, str) and value and (value[0].isspace() or value[-1].isspace()):
        return value.strip()
    return value
8 changes: 4 additions & 4 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import TYPE_CHECKING

from pyxform.parsing.expression import parse_expression
from pyxform.utils import BRACKETED_TAG_REGEX, node
from pyxform.parsing.expression import RE_PYXFORM_REF, parse_expression
from pyxform.utils import node

if TYPE_CHECKING:
from pyxform.survey import Survey
Expand Down Expand Up @@ -99,7 +99,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
:return: The possibly modified string.
"""
# 9 = len("instance(")
if 9 >= len(xml_text):
if len(xml_text) <= 9 or "instance(" not in xml_text:
return xml_text
boundaries = find_boundaries(xml_text=xml_text)
if boundaries:
Expand All @@ -108,7 +108,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
old_str = xml_text[start:end]
# Pass the new string through the pyxform reference replacer.
# noinspection PyProtectedMember
new_str = BRACKETED_TAG_REGEX.sub(
new_str = RE_PYXFORM_REF.sub(
lambda m: survey._var_repl_function(m, context),
old_str,
)
Expand Down
Loading
Loading