Skip to content

Commit 267a1ee

Browse files
committed
1 parent 7021586 commit 267a1ee

10 files changed

+319
-66
lines changed

machine/corpora/paratext_project_terms_parser_base.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
7070
id = term.attrib["Id"]
7171
if _is_in_category(id, term_categories, term_id_to_category_dict):
7272
id_ = id.replace("\n", "&#xA")
73-
renderings = term.find("Renderings")
74-
gloss = renderings.text if renderings is not None and renderings.text is not None else ""
75-
glosses = _get_glosses(gloss)
76-
terms_renderings[id_].extend(glosses)
73+
renderings_element = term.find("Renderings")
74+
rendering_text = (
75+
renderings_element.text
76+
if renderings_element is not None and renderings_element.text is not None
77+
else ""
78+
)
79+
renderings = _get_renderings(rendering_text)
80+
terms_renderings[id_].extend(renderings)
7781

7882
terms_glosses: Dict[str, List[str]] = defaultdict(list)
7983
if terms_glosses_doc is not None and use_term_glosses:
@@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category
102106
return not term_categories or (category is not None and category in term_categories)
103107

104108

109+
def _clean_term(term: str):
    """Normalize a raw term string.

    The term is trimmed, passed through ``_strip_parens`` with its default
    delimiters, and every run of whitespace is collapsed to a single space.
    """
    without_parens = _strip_parens(term.strip())
    # str.split() with no separator splits on any whitespace run and drops empty
    # pieces, so re-joining with one space also trims both ends.
    return " ".join(without_parens.split())
114+
115+
105116
def _get_glosses(gloss: str) -> List[str]:
    """Split a raw gloss string into a list of distinct, cleaned glosses.

    If ``_CONTENT_IN_BRACKETS_REGEX`` matches at the start of ``gloss``, only its
    first capture group is kept. The text is then cleaned with ``_clean_term``,
    passed through ``_strip_parens`` using square brackets as delimiters, and
    every substring matched by ``_NUMERICAL_INFORMATION_REGEX`` is removed.
    What remains is split on ``,``, ``;`` and ``/``.

    Returns:
        The non-empty, stripped glosses with duplicates removed, in order of
        first appearance.
    """
    bracket_match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
    if bracket_match:
        gloss = bracket_match.group(1)
    gloss = _clean_term(gloss)
    gloss = _strip_parens(gloss, left="[", right="]")
    gloss = gloss.strip()
    # finditer() scans the string as it was when the loop started; each matched
    # text is then removed from the progressively edited gloss.
    for numerical_match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
        gloss = gloss.replace(numerical_match.group(0), "")
    stripped_parts = (part.strip() for part in re.split(r"[,;/]", gloss))
    # dict.fromkeys() de-duplicates while keeping first-seen order. A set would
    # make the order depend on string hashing, which can vary between runs.
    return list(dict.fromkeys(part for part in stripped_parts if part))
122128

123129

130+
def _get_renderings(rendering: str) -> List[str]:
    """Split a raw renderings string into a list of cleaned renderings.

    The input is split on the ``||`` separator. Each piece is cleaned with
    ``_clean_term`` and has every ``*`` character removed.

    Returns:
        The non-empty cleaned renderings, in their original order.
    """
    renderings: List[str] = []
    for raw_rendering in re.split(r"\|\|", rendering.strip()):
        without_asterisks = _clean_term(raw_rendering).replace("*", "")
        # Removing "*" can expose leading/trailing or doubled whitespace
        # (e.g. "foo *" -> "foo ", "* *" -> " "), so normalize again afterwards.
        # Otherwise a whitespace-only rendering would pass the emptiness check.
        cleaned = " ".join(without_asterisks.split())
        if cleaned:
            renderings.append(cleaned)
    return renderings
134+
135+
124136
def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
125137
parens: int = 0
126138
end: int = -1

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Iterable, Optional, Sequence, Union
2+
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union
33

44
from ..utils.typeshed import StrPath
55
from .paratext_project_settings import ParatextProjectSettings
@@ -11,7 +11,7 @@
1111
UpdateUsfmTextBehavior,
1212
)
1313
from .usfm_parser import parse_usfm
14-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
14+
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException
1515

1616

1717
class ParatextProjectTextUpdaterBase(ABC):
@@ -33,6 +33,8 @@ def update_usfm(
3333
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
3434
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
3535
remarks: Optional[Iterable[str]] = None,
36+
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerException], bool]] = None,
37+
compare_segments: bool = False,
3638
) -> Optional[str]:
3739
file_name: str = self._settings.get_book_file_name(book_id)
3840
if not self._exists(file_name):
@@ -49,6 +51,8 @@ def update_usfm(
4951
preserve_paragraph_styles,
5052
update_block_handlers=update_block_handlers,
5153
remarks=remarks,
54+
error_handler=error_handler,
55+
compare_segments=compare_segments,
5256
)
5357
try:
5458
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .usfm_token import UsfmToken, UsfmTokenType
88
from .usfm_update_block import UsfmUpdateBlock
99
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
10-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
10+
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerException
1111

1212
PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"
1313

@@ -118,7 +118,16 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
118118
trg_tok_starts = []
119119
prev_len = 0
120120
for tok in trg_toks:
121-
trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0))
121+
try:
122+
index_of_trg_tok_in_sent = trg_sent.index(
123+
tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0
124+
)
125+
except ValueError:
126+
raise UsfmUpdateBlockHandlerException(
127+
block,
128+
f'No token "{tok}" found in text "{trg_sent}" at or beyond index {trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}. Is the versification correctly specified?',
129+
)
130+
trg_tok_starts.append(index_of_trg_tok_in_sent)
122131
prev_len = len(tok)
123132

124133
# Predict marker placements and get insertion order

machine/corpora/scripture_ref.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import List, Optional
55

66
from ..scripture.constants import ENGLISH_VERSIFICATION
7-
from ..scripture.verse_ref import VerseRef, Versification
7+
from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
88
from ..utils.comparable import Comparable
99
from .scripture_element import ScriptureElement
1010

@@ -112,15 +112,16 @@ def compare_to(self, other: object, compare_segments: bool = True) -> int:
112112
def __eq__(self, other: object) -> bool:
    """Return whether ``other`` is a ScriptureRef that compares equal to this one.

    The comparison is delegated to ``compare_to`` with segment comparison
    enabled; any non-ScriptureRef operand yields ``NotImplemented``.
    """
    if isinstance(other, ScriptureRef):
        # compare_to returns an int; 0 means the two refs are equal.
        return self.compare_to(other, True) == 0
    return NotImplemented
116116

117117
def __lt__(self, other: object) -> bool:
    """Return whether this ref orders before ``other`` according to ``compare_to``.

    Any non-ScriptureRef operand yields ``NotImplemented``.
    """
    if isinstance(other, ScriptureRef):
        # A negative compare_to result means this ref sorts first.
        return self.compare_to(other) < 0
    return NotImplemented
121121

122122
def __hash__(self) -> int:
    """Hash the verse reference together with the relaxed form of the path."""
    # Using to_relaxed() is necessary to maintain equality across relaxed refs,
    # __eq__ properly handles relaxed ref comparison
    hash_key = (self.verse_ref, tuple(self.to_relaxed().path))
    return hash(hash_key)
124125

125126
def __repr__(self) -> str:
126127
result = str(self.verse_ref)
@@ -129,4 +130,19 @@ def __repr__(self) -> str:
129130
return result
130131

131132

133+
class IgnoreSegmentsScriptureRef(ScriptureRef):
    """ScriptureRef variant whose equality and hash do not compare segments.

    Equality calls ``compare_to`` with ``compare_segments=False``, and the hash
    wraps the verse reference in ``IgnoreSegmentsVerseRef`` so that the two stay
    consistent with each other.
    """

    def __eq__(self, other: object) -> bool:
        """Return whether ``other`` is a ScriptureRef equal to this one, ignoring segments."""
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        # compare_to returns an int where 0 means "equal". Returning it directly
        # would be falsy for equal refs and truthy for different ones.
        return self.compare_to(other, False) == 0

    def __hash__(self) -> int:
        """Hash the segment-insensitive verse reference with the relaxed path."""
        return hash(
            (
                IgnoreSegmentsVerseRef(self.verse_ref),
                tuple(self.to_relaxed().path),
            )
        )
146+
147+
132148
EMPTY_SCRIPTURE_REF = ScriptureRef()

0 commit comments

Comments
 (0)