Skip to content

Commit 11d3611

Browse files
authored
Allow passing a selection of books and chapters to the PT quote detector (#236)
* Allow passing a selection of books and chapters to the PT (Paratext) quote detector
* Fix get_scripture_books to return DCs (deuterocanonical books)
1 parent edcf6c4 commit 11d3611

File tree

7 files changed

+189
-27
lines changed

7 files changed

+189
-27
lines changed

machine/punctuation_analysis/paratext_project_quote_convention_detector.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Optional, Union
2+
from typing import BinaryIO, Dict, List, Optional, Union
33

44
from ..corpora.paratext_project_settings import ParatextProjectSettings
55
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
66
from ..corpora.usfm_parser import parse_usfm
7+
from ..scripture.canon import book_id_to_number, get_scripture_books
78
from ..utils.typeshed import StrPath
89
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
910

@@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
1617
self._settings = settings
1718

1819
def get_quote_convention_analysis(
19-
self, handler: Optional[QuoteConventionDetector] = None
20+
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
2021
) -> Optional[QuoteConventionAnalysis]:
2122
handler = QuoteConventionDetector() if handler is None else handler
22-
for file_name in self._settings.get_all_scripture_book_file_names():
23+
for book_id in get_scripture_books():
24+
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
25+
continue
26+
file_name: str = self._settings.get_book_file_name(book_id)
2327
if not self._exists(file_name):
2428
continue
2529
with self._open(file_name) as sfm_file:
@@ -33,7 +37,7 @@ def get_quote_convention_analysis(
3337
f". Error: '{e}'"
3438
)
3539
raise RuntimeError(error_message) from e
36-
return handler.detect_quote_convention()
40+
return handler.detect_quote_convention(include_chapters)
3741

3842
@abstractmethod
3943
def _exists(self, file_name: StrPath) -> bool: ...

machine/punctuation_analysis/quote_convention_detector.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass
2-
from typing import List, Optional
2+
from typing import Dict, List, Optional
33

44
from .chapter import Chapter
55
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
@@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter(
5151

5252
self._quotation_mark_tabulator.tabulate(resolved_quotation_marks)
5353

54-
def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
55-
self._count_quotation_marks_in_chapters(self.get_chapters())
54+
def detect_quote_convention(
55+
self, include_chapters: Optional[Dict[int, List[int]]] = None
56+
) -> Optional[QuoteConventionAnalysis]:
57+
self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))
5658

5759
(best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
5860
self._quotation_mark_tabulator

machine/punctuation_analysis/text_segment.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
class TextSegment:
88
def __init__(self):
99
self._text = ""
10+
self.book: Optional[str] = None
11+
self.chapter: Optional[int] = None
1012
self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
1113
self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
1214
self.previous_segment: Optional[TextSegment] = None
@@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder":
7173
self._text_segment._markers_in_preceding_context.add(marker)
7274
return self
7375

76+
def set_book(self, code: str) -> "TextSegment.Builder":
    """Store the book code on the text segment being built.

    Args:
        code: Book code to assign to the segment's ``book`` attribute.

    Returns:
        This builder, so that calls can be chained.
    """
    self._text_segment.book = code
    return self
79+
80+
def set_chapter(self, number: int) -> "TextSegment.Builder":
    """Store the chapter number on the text segment being built.

    Args:
        number: Chapter number to assign to the segment's ``chapter`` attribute.

    Returns:
        This builder, so that calls can be chained.
    """
    self._text_segment.chapter = number
    return self
83+
7484
def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
7585
self._text_segment._usfm_token = token
7686
return self

machine/punctuation_analysis/usfm_structure_extractor.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from typing import Optional, Sequence
1+
from typing import Dict, List, Optional, Sequence
22

33
from ..corpora.usfm_parser_handler import UsfmParserHandler
44
from ..corpora.usfm_parser_state import UsfmParserState
55
from ..corpora.usfm_token import UsfmAttribute
6+
from ..scripture.canon import book_id_to_number
67
from .chapter import Chapter
78
from .text_segment import TextSegment
89
from .usfm_marker_type import UsfmMarkerType
@@ -14,6 +15,9 @@ def __init__(self):
1415
self._text_segments: list[TextSegment] = []
1516
self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder()
1617

18+
def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Pass the starting book's code to the pending text-segment builder.

    Args:
        state: Parser state supplied by the USFM parser (unused here).
        marker: The marker that opened the book (unused here).
        code: Book code forwarded to the builder's ``set_book``.
    """
    self._next_text_segment_builder.set_book(code)
20+
1721
def chapter(
1822
self,
1923
state: UsfmParserState,
@@ -23,6 +27,8 @@ def chapter(
2327
pub_number: Optional[str],
2428
) -> None:
2529
self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)
30+
chapter_number: int = int(number) if number.isdigit() else 0
31+
self._next_text_segment_builder.set_chapter(chapter_number)
2632

2733
def start_para(
2834
self,
@@ -79,11 +85,26 @@ def text(self, state: UsfmParserState, text: str) -> None:
7985
self._text_segments.append(text_segment)
8086
self._next_text_segment_builder = TextSegment.Builder()
8187

82-
def get_chapters(self) -> list[Chapter]:
88+
def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]:
8389
chapters: list[Chapter] = []
90+
current_book: int = 0
91+
current_chapter: int = 0
8492
current_chapter_verses: list[Verse] = []
8593
current_verse_segments: list[TextSegment] = []
8694
for text_segment in self._text_segments:
95+
if text_segment.book is not None:
96+
current_book = book_id_to_number(text_segment.book)
97+
if text_segment.chapter is not None:
98+
current_chapter = text_segment.chapter
99+
if include_chapters is not None and current_book > 0:
100+
if current_book not in include_chapters:
101+
continue
102+
elif (
103+
current_chapter > 0
104+
and len(include_chapters[current_book]) > 0
105+
and current_chapter not in include_chapters[current_book]
106+
):
107+
continue
87108
if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE):
88109
if len(current_verse_segments) > 0:
89110
current_chapter_verses.append(Verse(current_verse_segments))

machine/scripture/canon.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,4 @@ def is_canonical(book: Union[str, int]) -> bool:
184184

185185

186186
def get_scripture_books() -> Iterable[str]:
    """Return the IDs of all books in ``BOOK_NUMBERS`` that ``is_canonical`` accepts.

    The IDs are returned as a list, in ``BOOK_NUMBERS`` iteration order.
    """
    return [book_id for book_id, book_num in BOOK_NUMBERS.items() if is_canonical(book_num)]

tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py

Lines changed: 96 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
1-
from typing import Dict, Optional
1+
from typing import Dict, List, Optional
22

33
from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector
44

55
from machine.corpora import ParatextProjectSettings, UsfmStylesheet
6-
from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis
7-
from machine.scripture import ORIGINAL_VERSIFICATION, Versification
6+
from machine.punctuation_analysis import (
7+
STANDARD_QUOTE_CONVENTIONS,
8+
ParatextProjectQuoteConventionDetector,
9+
QuoteConvention,
10+
QuoteConventionAnalysis,
11+
)
12+
from machine.scripture import ORIGINAL_VERSIFICATION, Versification, get_chapters
13+
14+
standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
15+
"standard_english"
16+
)
17+
standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
18+
"standard_french"
19+
)
820

921

1022
def test_get_quote_convention() -> None:
1123
env = _TestEnvironment(
1224
files={
13-
"41MATTest.SFM": r"""\id MAT
14-
\c 1
15-
\v 1 Someone said, “This is something I am saying!
16-
\v 2 This is also something I am saying” (that is, “something I am speaking”).
17-
\p
18-
\v 3 Other text, and someone else said,
19-
\q1
20-
\v 4 “Things
21-
\q2 someone else said!
22-
\q3 and more things someone else said.”
23-
\m That is why he said “things someone else said.”
24-
\v 5 Then someone said, “More things someone said.”""",
25+
"41MATTest.SFM": rf"""\id MAT
26+
{get_test_chapter(1, standard_english_quote_convention)}""",
2527
}
2628
)
2729
analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()
@@ -30,6 +32,64 @@ def test_get_quote_convention() -> None:
3032
assert analysis.best_quote_convention.name == "standard_english"
3133

3234

35+
def test_get_quote_convention_by_book() -> None:
    """Restricting detection to MRK should report MRK's convention, not MAT's.

    MAT is written with the English convention and MRK with the French one;
    only MRK is requested, so the French convention is expected to win.
    """
    env = _TestEnvironment(
        files={
            "41MATTest.SFM": rf"""\id MAT
{get_test_chapter(1, standard_english_quote_convention)}""",
            "42MRKTest.SFM": rf"""\id MRK
{get_test_chapter(1, standard_french_quote_convention)}""",
        }
    )
    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK")
    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.8
    assert analysis.best_quote_convention.name == "standard_french"
48+
49+
50+
def test_get_quote_convention_by_chapter() -> None:
    """Restricting detection to MRK 2, 4 and 5 should favour the French convention.

    Of the selected chapters, 2 and 5 use the French convention and 4 uses the
    English one, hence the lower score threshold of 0.66.
    """
    env = _TestEnvironment(
        files={
            "41MATTest.SFM": rf"""\id MAT
{get_test_chapter(1, standard_english_quote_convention)}""",
            "42MRKTest.SFM": rf"""\id MRK
{get_test_chapter(1, standard_english_quote_convention)}
{get_test_chapter(2, standard_french_quote_convention)}
{get_test_chapter(3, standard_english_quote_convention)}
{get_test_chapter(4, standard_english_quote_convention)}
{get_test_chapter(5, standard_french_quote_convention)}""",
        }
    )
    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5")
    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.66
    assert analysis.best_quote_convention.name == "standard_french"
67+
68+
69+
def test_get_quote_convention_by_chapter_indeterminate() -> None:
    """Selecting only chapters without quotation marks should yield no analysis.

    MAT 1 and 3 are generated without any quotation marks; MAT 2, which has
    English-convention quotes, is excluded by the requested range.
    """
    env = _TestEnvironment(
        files={
            "41MATTest.SFM": rf"""\id MAT
{get_test_chapter(1, None)}
{get_test_chapter(2, standard_english_quote_convention)}
{get_test_chapter(3, None)}""",
        }
    )
    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3")
    assert analysis is None
80+
81+
82+
def test_get_quote_convention_invalid_book_code() -> None:
    """A file whose ``id`` line disagrees with the requested book yields no analysis.

    The file is named for MAT but identifies itself as LUK, and MAT is the
    book requested.
    """
    env = _TestEnvironment(
        files={
            "41MATTest.SFM": rf"""\id LUK
{get_test_chapter(1, standard_english_quote_convention)}""",
        }
    )
    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT")
    assert analysis is None
91+
92+
3393
class _TestEnvironment:
3494
def __init__(
3595
self,
@@ -44,8 +104,27 @@ def __init__(
44104
def detector(self) -> ParatextProjectQuoteConventionDetector:
45105
return self._detector
46106

47-
def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]:
    """Run quote-convention detection, optionally limited to a scripture range.

    Args:
        scripture_range: Range string such as ``"MRK"`` or ``"MRK2,4-5"``. It is
            converted with ``get_chapters`` using ``ORIGINAL_VERSIFICATION``.
            ``None`` (the default) applies no restriction.

    Returns:
        Whatever the detector's ``get_quote_convention_analysis`` returns.
    """
    chapters: Optional[Dict[int, List[int]]] = None
    if scripture_range is not None:
        chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION)
    return self.detector.get_quote_convention_analysis(include_chapters=chapters)
112+
113+
114+
def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str:
    """Build the USFM text of a five-verse test chapter.

    Args:
        number: Chapter number written after the chapter marker.
        quote_convention: Convention whose depth-1 opening and closing marks
            wrap each quotation. ``None`` produces a chapter with no
            quotation marks at all.

    Returns:
        The chapter as a USFM string, with no trailing newline.
    """
    left_quote: str = ""
    right_quote: str = ""
    # Compare against None explicitly so that a convention object is never
    # skipped merely because it happens to evaluate as falsy.
    if quote_convention is not None:
        left_quote = quote_convention.get_opening_quotation_mark_at_depth(1)
        right_quote = quote_convention.get_closing_quotation_mark_at_depth(1)
    return rf"""\c {number}
\v 1 Someone said, {left_quote}This is something I am saying!
\v 2 This is also something I am saying{right_quote} (that is, {left_quote}something I am speaking{right_quote}).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 {left_quote}Things
\q2 someone else said!
\q3 and more things someone else said.{right_quote}
\m That is why he said {left_quote}things someone else said.{right_quote}
\v 5 Then someone said, {left_quote}More things someone said.{right_quote}"""
49128

50129

51130
class _DefaultParatextProjectSettings(ParatextProjectSettings):

tests/punctuation_analysis/test_usfm_structure_extractor.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,52 @@
77
verse_text_parser_state.verse_ref.verse_num = 1
88

99

10+
def test_get_chapters_filter_by_book():
    """Filtering on a book other than the one extracted should return no chapters."""
    usfm_structure_extractor = UsfmStructureExtractor()
    usfm_structure_extractor.start_book(verse_text_parser_state, "id", "GEN")
    usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)
    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
    usfm_structure_extractor.text(verse_text_parser_state, "test")

    actual_chapters = usfm_structure_extractor.get_chapters({2: [1]})  # EXO 1
    assert len(actual_chapters) == 0
19+
20+
21+
def test_get_chapters_filter_by_chapter():
    """Filtering MAT to chapter 2 should keep only that chapter's segment.

    Three single-verse chapters are fed in. With ``{40: [2]}`` only the
    "test2" segment is expected, and it must not be linked to the segments
    of the excluded chapters on either side.
    """
    usfm_structure_extractor = UsfmStructureExtractor()
    usfm_structure_extractor.start_book(verse_text_parser_state, "id", "MAT")
    usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)
    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
    usfm_structure_extractor.text(verse_text_parser_state, "test")
    usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None)
    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
    usfm_structure_extractor.text(verse_text_parser_state, "test2")
    usfm_structure_extractor.chapter(verse_text_parser_state, "3", "c", None, None)
    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
    usfm_structure_extractor.text(verse_text_parser_state, "test3")

    expected_chapters = [
        Chapter(
            [
                Verse(
                    [
                        TextSegment.Builder()
                        .set_text("test2")
                        .add_preceding_marker(UsfmMarkerType.CHAPTER)
                        .add_preceding_marker(UsfmMarkerType.VERSE)
                        .build()
                    ]
                )
            ]
        )
    ]

    actual_chapters = usfm_structure_extractor.get_chapters({40: [2]})
    assert_chapter_equal(expected_chapters, actual_chapters)
    # The surviving segment must not point at segments from filtered-out chapters.
    assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None
    assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None
54+
55+
1056
def test_chapter_and_verse_markers():
1157
usfm_structure_extractor = UsfmStructureExtractor()
1258
usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)

0 commit comments

Comments
 (0)