sillsdev · Enkidu93 · Oct 6, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py
@@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
                 id = term.attrib["Id"]
                 if _is_in_category(id, term_categories, term_id_to_category_dict):
                     id_ = id.replace("\n", "&#xA")
-                    renderings = term.find("Renderings")
-                    gloss = renderings.text if renderings is not None and renderings.text is not None else ""
-                    glosses = _get_glosses(gloss)
-                    terms_renderings[id_].extend(glosses)
+                    renderings_element = term.find("Renderings")
+                    rendering_text = (
+                        renderings_element.text
+                        if renderings_element is not None and renderings_element.text is not None
+                        else ""
+                    )
+                    renderings = _get_renderings(rendering_text)
+                    terms_renderings[id_].extend(renderings)
 
         terms_glosses: Dict[str, List[str]] = defaultdict(list)
         if terms_glosses_doc is not None and use_term_glosses:
@@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category
     return not term_categories or (category is not None and category in term_categories)
 
 
+def _clean_term(term: str) -> str:  # normalizes one raw term string and returns the cleaned text
+    term = term.strip()  # trim outer whitespace before parentheses are handled
+    term = _strip_parens(term)  # called with its default delimiters, "(" and ")"
+    term = " ".join(term.split())  # collapse every whitespace run into a single space
+    return term
+
+
 def _get_glosses(gloss: str) -> List[str]:
     match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
     if match:
-        gloss = match.group(0)
-    gloss = gloss.replace("?", "")
-    gloss = gloss.replace("*", "")
-    gloss = gloss.replace("/", " ")
-    gloss = gloss.strip()
-    gloss = _strip_parens(gloss)
+        gloss = match.group(1)  # use capture group 1 of the bracket regex, not the whole match
+    gloss = _clean_term(gloss)  # shared trim / "(...)" strip / whitespace-collapse step
     gloss = _strip_parens(gloss, left="[", right="]")
     gloss = gloss.strip()
     for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
         gloss = gloss.replace(match.group(0), "")
-    glosses = re.split(r"\|\|", gloss)
-    glosses = [re.split(r"[,;]", g) for g in glosses]
-    glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()]
+    pieces = (piece.strip() for piece in re.split(r"[,;/]", gloss))  # "," ";" and "/" separate glosses
+    glosses = list(dict.fromkeys(piece for piece in pieces if piece))  # dedupe; keep first-seen order
     return glosses
 
 
+def _get_renderings(rendering: str) -> List[str]:
+    # Split on "||"; remove "*" BEFORE cleaning so it cannot leave doubled or trailing spaces; drop empties.
+    cleaned = (_clean_term(part.replace("*", "")) for part in re.split(r"\|\|", rendering.strip()))
+    return [part for part in cleaned if part]
+
+
 def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
     parens: int = 0
     end: int = -1

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import BinaryIO, Iterable, Optional, Sequence, Union
+from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union
 
 from ..utils.typeshed import StrPath
 from .paratext_project_settings import ParatextProjectSettings
@@ -11,7 +11,7 @@
     UpdateUsfmTextBehavior,
 )
 from .usfm_parser import parse_usfm
-from .usfm_update_block_handler import UsfmUpdateBlockHandler
+from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
 
 
 class ParatextProjectTextUpdaterBase(ABC):
@@ -33,6 +33,8 @@ def update_usfm(
         preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
         update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
         remarks: Optional[Iterable[str]] = None,
+        error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
+        compare_segments: bool = False,
     ) -> Optional[str]:
         file_name: str = self._settings.get_book_file_name(book_id)
         if not self._exists(file_name):
@@ -49,6 +51,8 @@ def update_usfm(
             preserve_paragraph_styles,
             update_block_handlers=update_block_handlers,
             remarks=remarks,
+            error_handler=error_handler,
+            compare_segments=compare_segments,
         )
         try:
             parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -7,7 +7,7 @@
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
 from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
-from .usfm_update_block_handler import UsfmUpdateBlockHandler
+from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
 
 PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"
 
@@ -118,7 +118,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         trg_tok_starts = []
         prev_len = 0
         for tok in trg_toks:
-            trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0))
+            search_start = trg_tok_starts[-1] + prev_len if trg_tok_starts else 0
+            try:
+                index_of_trg_tok_in_sent = trg_sent.index(tok, search_start)
+            except ValueError:
+                # Report the same start offset that was passed to index(), with separators between parts.
+                raise UsfmUpdateBlockHandlerError(
+                    block,
+                    f'No token "{tok}" found in text "{trg_sent}" at or beyond index '
+                    f"{search_start}. "
+                    "Is the versification correctly specified?",
+                )
+            trg_tok_starts.append(index_of_trg_tok_in_sent)
             prev_len = len(tok)
 
         # Predict marker placements and get insertion order

diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py
@@ -4,7 +4,7 @@
 from typing import List, Optional
 
 from ..scripture.constants import ENGLISH_VERSIFICATION
-from ..scripture.verse_ref import VerseRef, Versification
+from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
 from ..utils.comparable import Comparable
 from .scripture_element import ScriptureElement
 
@@ -112,15 +112,17 @@ def compare_to(self, other: object, compare_segments: bool = True) -> int:
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, ScriptureRef):
             return NotImplemented
-        return self.verse_ref == other.verse_ref and self.path == other.path
+        return self.compare_to(other, True) == 0  # equal when compare_to (compare_segments=True) yields 0
 
     def __lt__(self, other: object) -> bool:
         if not isinstance(other, ScriptureRef):
             return NotImplemented
         return self.compare_to(other) < 0
 
     def __hash__(self) -> int:
-        return hash((self.verse_ref, tuple(self.path)))
+        # Hash the relaxed path rather than the raw one so that refs which __eq__ treats as
+        # equal once relaxed also hash equally (__eq__ delegates the comparison to compare_to).
+        return hash((self.verse_ref, tuple(self.to_relaxed().path)))
 
     def __repr__(self) -> str:
         result = str(self.verse_ref)
@@ -129,4 +131,19 @@ def __repr__(self) -> str:
         return result
 
 
+class IgnoreSegmentsScriptureRef(ScriptureRef):
+    """ScriptureRef whose equality test calls compare_to() with compare_segments=False."""
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ScriptureRef):
+            return NotImplemented
+        # compare_to returns an int ordering where 0 means equal, so it must be tested against 0;
+        # returning it directly would be falsy for equal refs and truthy for unequal ones.
+        return self.compare_to(other, False) == 0
+
+    def __hash__(self) -> int:
+        # Hash the IgnoreSegmentsVerseRef wrapper of the verse ref together with the relaxed path.
+        return hash((IgnoreSegmentsVerseRef(self.verse_ref), tuple(self.to_relaxed().path)))
+
+
 EMPTY_SCRIPTURE_REF = ScriptureRef()