Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 25 additions & 13 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,14 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
id = term.attrib["Id"]
if _is_in_category(id, term_categories, term_id_to_category_dict):
id_ = id.replace("\n", "&#xA")
renderings = term.find("Renderings")
gloss = renderings.text if renderings is not None and renderings.text is not None else ""
glosses = _get_glosses(gloss)
terms_renderings[id_].extend(glosses)
renderings_element = term.find("Renderings")
rendering_text = (
renderings_element.text
if renderings_element is not None and renderings_element.text is not None
else ""
)
renderings = _get_renderings(rendering_text)
terms_renderings[id_].extend(renderings)

terms_glosses: Dict[str, List[str]] = defaultdict(list)
if terms_glosses_doc is not None and use_term_glosses:
Expand Down Expand Up @@ -102,25 +106,33 @@ def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category
return not term_categories or (category is not None and category in term_categories)


def _clean_term(term: str):
    """Normalize a raw term string.

    Trims surrounding whitespace, passes the result through
    ``_strip_parens`` with its default delimiters, and finally collapses
    every run of whitespace into a single space.
    """
    without_parens = _strip_parens(term.strip())
    # split() with no arguments splits on any whitespace run and discards
    # leading/trailing whitespace, so re-joining yields single-spaced text.
    return " ".join(without_parens.split())


def _get_glosses(gloss: str) -> List[str]:
# Turns one raw gloss string into a list of individual gloss strings.
#
# NOTE(review): this text is a rendered diff whose +/- markers and
# indentation were stripped, so lines that look contradictory (group(0) vs
# group(1), two different splitting strategies) are presumably the removed
# and added versions shown back to back -- confirm against the merged file
# before relying on the exact statement sequence below.
match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
if match:
gloss = match.group(0)
gloss = gloss.replace("?", "")
gloss = gloss.replace("*", "")
gloss = gloss.replace("/", " ")
gloss = gloss.strip()
gloss = _strip_parens(gloss)
gloss = match.group(1)
gloss = _clean_term(gloss)
# Second stripping pass, this time with square brackets as the delimiters.
gloss = _strip_parens(gloss, left="[", right="]")
gloss = gloss.strip()
# Delete every substring matched by _NUMERICAL_INFORMATION_REGEX.
for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
gloss = gloss.replace(match.group(0), "")
glosses = re.split(r"\|\|", gloss)
glosses = [re.split(r"[,;]", g) for g in glosses]
glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()]
glosses = re.split(r"[,;/]", gloss)
# NOTE(review): list(set(...)) removes duplicates but does not keep the
# order in which the pieces appeared in the input string.
glosses = list(set([gloss.strip() for gloss in glosses if gloss.strip()]))
return glosses


def _get_renderings(rendering: str) -> List[str]:
    """Split a raw renderings string into cleaned, non-empty renderings.

    The input is split on the literal ``||`` separator. Every ``*`` is
    removed from each piece, the piece is normalized with ``_clean_term``,
    and pieces that end up empty are dropped.

    Args:
        rendering: Text content of a ``Renderings`` element (may be empty).

    Returns:
        The cleaned renderings, in their original order.
    """
    # Asterisks are removed *before* normalization. Removing them afterwards
    # could leave stray or doubled spaces behind (e.g. "a * b" -> "a  b"), or
    # a whitespace-only string that slips past the emptiness filter below.
    cleaned = (_clean_term(piece.replace("*", "")) for piece in rendering.split("||"))
    return [piece for piece in cleaned if piece]


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
parens: int = 0
end: int = -1
Expand Down
8 changes: 6 additions & 2 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Iterable, Optional, Sequence, Union
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
Expand All @@ -11,7 +11,7 @@
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


class ParatextProjectTextUpdaterBase(ABC):
Expand All @@ -33,6 +33,8 @@ def update_usfm(
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
Expand All @@ -49,6 +51,8 @@ def update_usfm(
preserve_paragraph_styles,
update_block_handlers=update_block_handlers,
remarks=remarks,
error_handler=error_handler,
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
Expand Down
15 changes: 13 additions & 2 deletions machine/corpora/place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .usfm_token import UsfmToken, UsfmTokenType
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError

PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"

Expand Down Expand Up @@ -118,7 +118,18 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
trg_tok_starts = []
prev_len = 0
for tok in trg_toks:
trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0))
try:
index_of_trg_tok_in_sent = trg_sent.index(
tok, trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0
)
except ValueError:
raise UsfmUpdateBlockHandlerError(
block,
f'No token "{tok}" found in text "{trg_sent}" at or beyond index'
f"{trg_tok_starts[-1] + prev_len if len(trg_tok_starts) > 0 else 0}."
"Is the versification correctly specified?",
)
trg_tok_starts.append(index_of_trg_tok_in_sent)
prev_len = len(tok)

# Predict marker placements and get insertion order
Expand Down
23 changes: 20 additions & 3 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification
from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement

Expand Down Expand Up @@ -112,15 +112,17 @@ def compare_to(self, other: object, compare_segments: bool = True) -> int:
def __eq__(self, other: object) -> bool:
# Equality is only defined against another ScriptureRef; any other operand
# yields NotImplemented.
if not isinstance(other, ScriptureRef):
return NotImplemented
# NOTE(review): this text is a rendered diff with the +/- markers and
# indentation stripped, so two return lines appear back to back. Presumably
# the first is the removed line and the second (compare_to(...) == 0) is its
# replacement -- confirm against the merged file.
return self.verse_ref == other.verse_ref and self.path == other.path
return self.compare_to(other, True) == 0

def __lt__(self, other: object) -> bool:
    """Return whether this reference orders before ``other``.

    Ordering is delegated to ``compare_to`` with its default arguments; a
    negative result means "less than". Operands that are not ``ScriptureRef``
    instances yield ``NotImplemented``.
    """
    if isinstance(other, ScriptureRef):
        return self.compare_to(other) < 0
    return NotImplemented

def __hash__(self) -> int:
# NOTE(review): this text is a rendered diff with the +/- markers and
# indentation stripped. Presumably the return directly below is the removed
# line and the to_relaxed() return at the end is its replacement -- confirm
# against the merged file.
return hash((self.verse_ref, tuple(self.path)))
# Using to_relaxed() is necessary to maintain equality across relaxed refs,
# __eq__ properly handles relaxed ref comparison
return hash((self.verse_ref, tuple(self.to_relaxed().path)))

def __repr__(self) -> str:
result = str(self.verse_ref)
Expand All @@ -129,4 +131,19 @@ def __repr__(self) -> str:
return result


class IgnoreSegmentsScriptureRef(ScriptureRef):
    """A ``ScriptureRef`` whose equality test calls ``compare_to`` with
    ``compare_segments=False`` and whose hash wraps the verse reference in
    ``IgnoreSegmentsVerseRef``.
    """

    def __eq__(self, other: object) -> bool:
        """Return True when ``compare_to(other, False)`` reports equality.

        Operands that are not ``ScriptureRef`` instances yield
        ``NotImplemented``.
        """
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        # compare_to returns an int where 0 means "equal". Returning it
        # directly would be falsy for equal refs and truthy for unequal ones,
        # so it has to be compared with 0 (as ScriptureRef.__eq__ does).
        return self.compare_to(other, False) == 0

    def __hash__(self) -> int:
        """Hash the segment-ignoring verse ref together with the relaxed path."""
        # NOTE(review): presumably IgnoreSegmentsVerseRef hashes without the
        # verse segment, keeping the hash consistent with __eq__ -- confirm
        # against its definition in scripture.verse_ref.
        return hash(
            (
                IgnoreSegmentsVerseRef(self.verse_ref),
                tuple(self.to_relaxed().path),
            )
        )


# Module-level ScriptureRef constructed with no arguments.
EMPTY_SCRIPTURE_REF = ScriptureRef()
Loading