Skip to content

Commit edcf6c4

Browse files
authored
1 parent 7021586 commit edcf6c4

9 files changed

+302
-63
lines changed

machine/corpora/paratext_project_terms_parser_base.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
7070
id = term.attrib["Id"]
7171
if _is_in_category(id, term_categories, term_id_to_category_dict):
7272
id_ = id.replace("\n", "&#xA")
73-
renderings = term.find("Renderings")
74-
gloss = renderings.text if renderings is not None and renderings.text is not None else ""
75-
glosses = _get_glosses(gloss)
76-
terms_renderings[id_].extend(glosses)
73+
renderings_element = term.find("Renderings")
74+
rendering_text = (
75+
renderings_element.text
76+
if renderings_element is not None and renderings_element.text is not None
77+
else ""
78+
)
79+
renderings = _get_renderings(rendering_text)
80+
terms_renderings[id_].extend(renderings)
7781

7882
terms_glosses: Dict[str, List[str]] = defaultdict(list)
7983
if terms_glosses_doc is not None and use_term_glosses:
@@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category
102106
return not term_categories or (category is not None and category in term_categories)
103107

104108

109+
def _clean_term(term: str):
    """Normalize a raw term string.

    The term is trimmed, passed through ``_strip_parens`` with its default
    delimiters, and then every run of whitespace is collapsed to one space.
    """
    without_parens = _strip_parens(term.strip())
    return " ".join(without_parens.split())
114+
115+
105116
def _get_glosses(gloss: str) -> List[str]:
    """Split a raw gloss string into distinct, cleaned glosses.

    Processing steps:
      1. If ``_CONTENT_IN_BRACKETS_REGEX`` matches at the start of the string,
         keep only its first capture group.
      2. Clean the text with ``_clean_term`` and run ``_strip_parens`` using
         square brackets as the delimiters.
      3. Remove every substring matched by ``_NUMERICAL_INFORMATION_REGEX``.
      4. Split on ``,``, ``;`` and ``/``; trim each piece and drop empty ones.

    Returns:
        The unique glosses in order of first appearance, so the result is
        stable from run to run.
    """
    bracket_match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
    if bracket_match:
        gloss = bracket_match.group(1)
    gloss = _clean_term(gloss)
    gloss = _strip_parens(gloss, left="[", right="]")
    gloss = gloss.strip()
    for numerical_match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
        gloss = gloss.replace(numerical_match.group(0), "")
    pieces = (piece.strip() for piece in re.split(r"[,;/]", gloss))
    # dict.fromkeys removes duplicates while keeping first-occurrence order;
    # a set would hand the glosses back in an arbitrary, hash-dependent order.
    return list(dict.fromkeys(piece for piece in pieces if piece))
122128

123129

130+
def _get_renderings(rendering: str) -> List[str]:
    """Split a raw renderings string into cleaned, non-empty renderings.

    The input is split on the ``||`` separator. Each piece is cleaned with
    ``_clean_term``, has every ``*`` character removed, and is then
    whitespace-normalized.

    Returns:
        The cleaned renderings in their original order, without empty entries.
    """
    cleaned: List[str] = []
    for raw in re.split(r"\|\|", rendering.strip()):
        # Normalize whitespace *after* dropping "*": removing the asterisk can
        # expose leading/trailing or doubled spaces (e.g. "word *" -> "word "),
        # and a whitespace-only leftover must not survive the emptiness check.
        text = " ".join(_clean_term(raw).replace("*", "").split())
        if text:
            cleaned.append(text)
    return cleaned
134+
135+
124136
def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
125137
parens: int = 0
126138
end: int = -1

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Iterable, Optional, Sequence, Union
2+
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union
33

44
from ..utils.typeshed import StrPath
55
from .paratext_project_settings import ParatextProjectSettings
@@ -11,7 +11,7 @@
1111
UpdateUsfmTextBehavior,
1212
)
1313
from .usfm_parser import parse_usfm
14-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
14+
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
1515

1616

1717
class ParatextProjectTextUpdaterBase(ABC):
@@ -33,6 +33,8 @@ def update_usfm(
3333
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
3434
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
3535
remarks: Optional[Iterable[str]] = None,
36+
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
37+
compare_segments: bool = False,
3638
) -> Optional[str]:
3739
file_name: str = self._settings.get_book_file_name(book_id)
3840
if not self._exists(file_name):
@@ -49,6 +51,8 @@ def update_usfm(
4951
preserve_paragraph_styles,
5052
update_block_handlers=update_block_handlers,
5153
remarks=remarks,
54+
error_handler=error_handler,
55+
compare_segments=compare_segments,
5256
)
5357
try:
5458
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .usfm_token import UsfmToken, UsfmTokenType
88
from .usfm_update_block import UsfmUpdateBlock
99
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
10-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
10+
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
1111

1212
PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"
1313

@@ -118,7 +118,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
118118
trg_tok_starts = []
119119
prev_len = 0
120120
for tok in trg_toks:
121-
trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0))
121+
try:
122+
index_of_trg_tok_in_sent = trg_sent.index(
123+
tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0
124+
)
125+
except ValueError:
126+
raise UsfmUpdateBlockHandlerError(
127+
block,
128+
f'No token "{tok}" found in text "{trg_sent}" at or beyond index'
129+
f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}."
130+
"Is the versification correctly specified?",
131+
)
132+
trg_tok_starts.append(index_of_trg_tok_in_sent)
122133
prev_len = len(tok)
123134

124135
# Predict marker placements and get insertion order

machine/corpora/update_usfm_parser_handler.py

Lines changed: 108 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from enum import Enum, auto
2-
from typing import Iterable, List, Optional, Sequence, Tuple, Union
2+
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
33

4+
from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
45
from .scripture_ref import ScriptureRef
56
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
67
from .usfm_parser_state import UsfmParserState
@@ -10,7 +11,7 @@
1011
from .usfm_tokenizer import UsfmTokenizer
1112
from .usfm_update_block import UsfmUpdateBlock
1213
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
13-
from .usfm_update_block_handler import UsfmUpdateBlockHandler
14+
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
1415

1516

1617
class UpdateUsfmTextBehavior(Enum):
@@ -24,6 +25,12 @@ class UpdateUsfmMarkerBehavior(Enum):
2425
STRIP = auto()
2526

2627

28+
class _RowInfo:
29+
def __init__(self, row_index: int):
30+
self.row_index = row_index
31+
self.is_consumed = False
32+
33+
2734
class UpdateUsfmRow:
2835
def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None):
2936
self.refs = refs
@@ -43,9 +50,19 @@ def __init__(
4350
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
4451
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
4552
remarks: Optional[Iterable[str]] = None,
53+
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
54+
compare_segments: bool = False,
4655
) -> None:
4756
super().__init__()
4857
self._rows = rows or []
58+
self._verse_rows: List[int] = []
59+
self._verse_row_index = 0
60+
self._verse_rows_map: Dict[VerseRef, List[_RowInfo]] = {}
61+
self._verse_rows_ref = VerseRef()
62+
if len(self._rows) > 0:
63+
self._update_rows_versification: Versification = self._rows[0].refs[0].versification
64+
else:
65+
self._update_rows_versification = Versification.get_builtin("English")
4966
self._tokens: List[UsfmToken] = []
5067
self._updated_text: List[UsfmToken] = []
5168
self._update_block_stack: list[UsfmUpdateBlock] = []
@@ -65,6 +82,11 @@ def __init__(
6582
self._remarks = []
6683
else:
6784
self._remarks = list(remarks)
85+
if error_handler is None:
86+
self._error_handler = lambda _: False
87+
else:
88+
self._error_handler = error_handler
89+
self._compare_segments = compare_segments
6890
self._text_behavior = text_behavior
6991
self._paragraph_behavior = paragraph_behavior
7092
self._embed_behavior = embed_behavior
@@ -82,6 +104,10 @@ def end_usfm(self, state: UsfmParserState) -> None:
82104
super().end_usfm(state)
83105

84106
def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
107+
self._verse_rows_ref = state.verse_ref.copy()
108+
self._update_verse_rows_map()
109+
self._update_verse_rows()
110+
85111
self._collect_readonly_tokens(state)
86112
self._update_block_stack.append(UsfmUpdateBlock())
87113
start_book_tokens: List[UsfmToken] = []
@@ -108,7 +134,7 @@ def start_para(
108134
) -> None:
109135
if state.is_verse_text:
110136
# Only strip paragraph markers in a verse
111-
if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE:
137+
if self._paragraph_behavior == UpdateUsfmMarkerBehavior.PRESERVE and not self._duplicate_verse:
112138
self._collect_updatable_tokens(state)
113139
else:
114140
self._skip_updatable_tokens(state)
@@ -148,6 +174,11 @@ def chapter(
148174
) -> None:
149175
self._use_updated_text()
150176

177+
if self._verse_rows_ref != state.verse_ref:
178+
self._verse_rows_ref = state.verse_ref.copy()
179+
self._update_verse_rows_map()
180+
self._update_verse_rows()
181+
151182
super().chapter(state, number, marker, alt_number, pub_number)
152183

153184
self._collect_readonly_tokens(state)
@@ -179,14 +210,23 @@ def verse(
179210
if last_paragraph is not None:
180211
last_paragraph.marked_for_removal = False
181212

182-
super().verse(state, number, marker, alt_number, pub_number)
213+
if self._verse_rows_ref != state.verse_ref:
214+
self._verse_rows_ref = state.verse_ref.copy()
215+
self._update_verse_rows()
183216

184-
self._collect_readonly_tokens(state)
217+
super().verse(state, number, marker, alt_number, pub_number)
218+
if self._duplicate_verse:
219+
self._skip_updatable_tokens(state)
220+
else:
221+
self._collect_readonly_tokens(state)
185222

186223
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None:
    """Handle the start of a note.

    After delegating to the parent handler, the pending tokens are skipped
    when ``self._duplicate_verse`` is set and collected as updatable otherwise.
    """
    super().start_note(state, marker, caller, category)

    if self._duplicate_verse:
        self._skip_updatable_tokens(state)
        return
    self._collect_updatable_tokens(state)
190230

191231
def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
192232
if closed:
@@ -219,15 +259,14 @@ def end_char(
219259
attributes: Sequence[UsfmAttribute],
220260
closed: bool,
221261
) -> None:
222-
if closed:
223-
if self._current_text_type == ScriptureTextType.EMBED:
224-
self._collect_updatable_tokens(state)
262+
if self._current_text_type == ScriptureTextType.EMBED:
263+
self._collect_updatable_tokens(state)
264+
else:
265+
self._replace_with_new_tokens(state)
266+
if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP:
267+
self._skip_updatable_tokens(state)
225268
else:
226-
self._replace_with_new_tokens(state)
227-
if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP:
228-
self._skip_updatable_tokens(state)
229-
else:
230-
self._collect_updatable_tokens(state)
269+
self._collect_updatable_tokens(state)
231270

232271
super().end_char(state, marker, attributes, closed)
233272

@@ -242,7 +281,9 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) ->
242281
def text(self, state: UsfmParserState, text: str) -> None:
243282
super().text(state, text)
244283

245-
if self._replace_with_new_tokens(state):
284+
if self._replace_with_new_tokens(state) or (
285+
self._duplicate_verse and self._current_text_type == ScriptureTextType.VERSE
286+
):
246287
self._skip_updatable_tokens(state)
247288
else:
248289
self._collect_updatable_tokens(state)
@@ -292,11 +333,10 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
292333
for remark in self._remarks:
293334
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
294335
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
295-
if len(tokens) > 0 and tokens[0].marker == "id":
296-
index = 1
297-
if len(tokens) > 1 and tokens[1].type == UsfmTokenType.TEXT:
298-
index = 2
299-
while tokens[index].marker == "rem":
336+
if len(tokens) > 0:
337+
index = 0
338+
markers_to_skip = {"id", "ide", "rem"}
339+
while tokens[index].marker in markers_to_skip:
300340
index += 1
301341
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
302342
index += 1
@@ -308,13 +348,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
308348
row_texts: List[str] = []
309349
row_metadata = None
310350
source_index: int = 0
311-
while self._row_index < len(self._rows) and source_index < len(seg_scr_refs):
351+
while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs):
312352
compare: int = 0
313-
row = self._rows[self._row_index]
353+
row = self._rows[self._verse_rows[self._verse_row_index]]
314354
row_scr_refs, text, metadata = row.refs, row.text, row.metadata
315355
for row_scr_ref in row_scr_refs:
316356
while source_index < len(seg_scr_refs):
317-
compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False)
357+
compare = row_scr_ref.compare_to(
358+
seg_scr_refs[source_index], compare_segments=self._compare_segments
359+
)
318360
if compare > 0:
319361
# row is ahead of source, increment source
320362
source_index += 1
@@ -328,7 +370,7 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
328370
break
329371
if compare <= 0:
330372
# source is ahead of row, increment row
331-
self._row_index += 1
373+
self._verse_row_index += 1
332374
return row_texts, row_metadata
333375

334376
def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
@@ -418,7 +460,13 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr
418460
para_elems.append(update_block.pop())
419461

420462
for handler in self._update_block_handlers:
421-
update_block = handler.process_block(update_block)
463+
try:
464+
update_block = handler.process_block(update_block)
465+
except UsfmUpdateBlockHandlerError as e:
466+
should_continue = self._error_handler(e)
467+
if not should_continue:
468+
raise
469+
422470
tokens = update_block.get_tokens()
423471
for elem in reversed(para_elems):
424472
tokens.extend(elem.get_tokens())
@@ -449,6 +497,41 @@ def _pop_new_tokens(self) -> None:
449497
def _is_in_preserved_paragraph(self, state: UsfmParserState) -> bool:
    """Return whether the current paragraph's marker is a preserved style.

    ``False`` when the parser state has no paragraph tag.
    """
    para_tag = state.para_tag
    if para_tag is None:
        return False
    return para_tag.marker in self._preserve_paragraph_styles
451499

500+
def _update_verse_rows_map(self) -> None:
    """Rebuild the verse-to-rows lookup for the chapter of ``self._verse_rows_ref``.

    Starting at ``self._row_index``, rows are consumed for as long as the
    chapter number of their first reference equals the current chapter. Each
    consumed row gets one ``_RowInfo`` that is registered under the verse
    reference of every one of its references. ``self._row_index`` is left
    pointing at the first row that was not consumed.
    """
    self._verse_rows_map.clear()
    current_chapter = self._verse_rows_ref.chapter_num
    while self._row_index < len(self._rows):
        row = self._rows[self._row_index]
        if row.refs[0].chapter_num != current_chapter:
            break
        info = _RowInfo(self._row_index)
        for scr_ref in row.refs:
            # Unless segments are being compared, key on a segment-insensitive reference.
            if self._compare_segments:
                key = scr_ref.verse_ref
            else:
                key = IgnoreSegmentsVerseRef(scr_ref.verse_ref)
            self._verse_rows_map.setdefault(key, []).append(info)
        self._row_index += 1
515+
516+
def _update_verse_rows(self) -> None:
517+
vref = self._verse_rows_ref.copy()
518+
# We are using a dictionary, which uses an equality comparer. As a result, we need to change the
519+
# source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it
520+
# would be less efficient.
521+
vref.change_versification(self._update_rows_versification)
522+
523+
self._verse_rows.clear()
524+
self._verse_row_index = 0
525+
526+
for vr in vref.all_verses():
527+
if not self._compare_segments:
528+
vr = IgnoreSegmentsVerseRef(vr)
529+
if rows := self._verse_rows_map.get(vr):
530+
for row in rows:
531+
if not row.is_consumed:
532+
self._verse_rows.append(row.row_index)
533+
row.is_consumed = True
534+
452535

453536
def _is_nonverse_paragraph(state: UsfmParserState, element: UsfmUpdateBlockElement) -> bool:
454537
if element.type != UsfmUpdateBlockElementType.PARAGRAPH:

machine/corpora/usfm_update_block_handler.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,13 @@
66
class UsfmUpdateBlockHandler(ABC):
    """Abstract interface for objects that process a ``UsfmUpdateBlock``."""

    @abstractmethod
    def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
        """Process ``block`` and return the resulting update block."""
9+
10+
11+
class UsfmUpdateBlockHandlerError(Exception):
    """Exception that carries the update block it was raised for.

    Any additional positional arguments are forwarded to ``Exception``.
    """

    def __init__(self, block: UsfmUpdateBlock, *args):
        super().__init__(*args)
        self._block = block

    @property
    def block(self):
        """The update block supplied when the error was created."""
        return self._block

0 commit comments

Comments
 (0)