Skip to content

Commit 8fdb273

Browse files
committed
Allow passing a selection of books and chapters to the Paratext (PT) quote convention detector
1 parent 7021586 commit 8fdb273

File tree

5 files changed

+113
-24
lines changed

5 files changed

+113
-24
lines changed

machine/punctuation_analysis/paratext_project_quote_convention_detector.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Optional, Union
2+
from typing import BinaryIO, Dict, List, Optional, Union
33

44
from ..corpora.paratext_project_settings import ParatextProjectSettings
55
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
66
from ..corpora.usfm_parser import parse_usfm
7+
from ..scripture.canon import book_id_to_number, get_scripture_books
78
from ..utils.typeshed import StrPath
89
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
910

@@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
1617
self._settings = settings
1718

1819
def get_quote_convention_analysis(
19-
self, handler: Optional[QuoteConventionDetector] = None
20+
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
2021
) -> Optional[QuoteConventionAnalysis]:
2122
handler = QuoteConventionDetector() if handler is None else handler
22-
for file_name in self._settings.get_all_scripture_book_file_names():
23+
for book_id in get_scripture_books():
24+
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
25+
continue
26+
file_name: str = self._settings.get_book_file_name(book_id)
2327
if not self._exists(file_name):
2428
continue
2529
with self._open(file_name) as sfm_file:
@@ -33,7 +37,7 @@ def get_quote_convention_analysis(
3337
f". Error: '{e}'"
3438
)
3539
raise RuntimeError(error_message) from e
36-
return handler.detect_quote_convention()
40+
return handler.detect_quote_convention(include_chapters)
3741

3842
@abstractmethod
3943
def _exists(self, file_name: StrPath) -> bool: ...

machine/punctuation_analysis/quote_convention_detector.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from dataclasses import dataclass
2-
from typing import List, Optional
2+
from typing import Dict, List, Optional
33

44
from .chapter import Chapter
55
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
@@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter(
5151

5252
self._quotation_mark_tabulator.tabulate(resolved_quotation_marks)
5353

54-
def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
55-
self._count_quotation_marks_in_chapters(self.get_chapters())
54+
def detect_quote_convention(
55+
self, include_chapters: Optional[Dict[int, List[int]]] = None
56+
) -> Optional[QuoteConventionAnalysis]:
57+
self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))
5658

5759
(best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
5860
self._quotation_mark_tabulator

machine/punctuation_analysis/text_segment.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
class TextSegment:
88
def __init__(self):
99
self._text = ""
10+
self.book: Optional[str] = None
11+
self.chapter: Optional[int] = None
1012
self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
1113
self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
1214
self.previous_segment: Optional[TextSegment] = None
@@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder":
7173
self._text_segment._markers_in_preceding_context.add(marker)
7274
return self
7375

76+
def set_book(self, code: str) -> "TextSegment.Builder":
    """Store the given book code on the segment being built.

    :param code: value assigned to the segment's ``book`` attribute.
    :return: this builder, so that calls can be chained.
    """
    segment = self._text_segment
    segment.book = code
    return self
79+
80+
def set_chapter(self, number: str) -> "TextSegment.Builder":
    """Store the chapter number on the segment being built.

    :param number: chapter number as text; it is converted with ``int``.
    :return: this builder, so that calls can be chained.

    If ``number`` cannot be converted to an integer, the segment's
    ``chapter`` attribute is set to ``None`` instead of raising.
    """
    try:
        chapter = int(number)
    except ValueError:
        # A non-numeric chapter field should not abort the whole parse;
        # ``None`` marks the chapter as unknown.
        # NOTE(review): confirm that callers treat ``None`` as "no chapter
        # information" rather than needing a hard failure here.
        chapter = None
    self._text_segment.chapter = chapter
    return self
83+
7484
def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
7585
self._text_segment._usfm_token = token
7686
return self

machine/punctuation_analysis/usfm_structure_extractor.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from typing import Optional, Sequence
1+
from typing import Dict, List, Optional, Sequence
22

33
from ..corpora.usfm_parser_handler import UsfmParserHandler
44
from ..corpora.usfm_parser_state import UsfmParserState
55
from ..corpora.usfm_token import UsfmAttribute
6+
from ..scripture.canon import book_id_to_number
67
from .chapter import Chapter
78
from .text_segment import TextSegment
89
from .usfm_marker_type import UsfmMarkerType
@@ -14,6 +15,9 @@ def __init__(self):
1415
self._text_segments: list[TextSegment] = []
1516
self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder()
1617

18+
def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Pass the book code to the builder of the next text segment.

    :param state: parser state; not used here.
    :param marker: marker text; not used here.
    :param code: book code forwarded to the builder's ``set_book``.
    """
    pending_builder = self._next_text_segment_builder
    pending_builder.set_book(code)
20+
1721
def chapter(
1822
self,
1923
state: UsfmParserState,
@@ -23,6 +27,7 @@ def chapter(
2327
pub_number: Optional[str],
2428
) -> None:
2529
self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)
30+
self._next_text_segment_builder.set_chapter(number)
2631

2732
def start_para(
2833
self,
@@ -79,11 +84,26 @@ def text(self, state: UsfmParserState, text: str) -> None:
7984
self._text_segments.append(text_segment)
8085
self._next_text_segment_builder = TextSegment.Builder()
8186

82-
def get_chapters(self) -> list[Chapter]:
87+
def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]:
8388
chapters: list[Chapter] = []
89+
current_book: int = 0
90+
current_chapter: int = 0
8491
current_chapter_verses: list[Verse] = []
8592
current_verse_segments: list[TextSegment] = []
8693
for text_segment in self._text_segments:
94+
if text_segment.book is not None:
95+
current_book = book_id_to_number(text_segment.book)
96+
if text_segment.chapter is not None:
97+
current_chapter = text_segment.chapter
98+
if include_chapters is not None and current_book > 0:
99+
if current_book not in include_chapters:
100+
continue
101+
elif (
102+
current_chapter > 0
103+
and len(include_chapters[current_book]) > 0
104+
and current_chapter not in include_chapters[current_book]
105+
):
106+
continue
87107
if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE):
88108
if len(current_verse_segments) > 0:
89109
current_chapter_verses.append(Verse(current_verse_segments))

tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py

Lines changed: 68 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
1-
from typing import Dict, Optional
1+
from typing import Dict, List, Optional
22

33
from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector
44

55
from machine.corpora import ParatextProjectSettings, UsfmStylesheet
66
from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis
7+
from machine.punctuation_analysis.quote_convention import QuoteConvention
8+
from machine.punctuation_analysis.standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
79
from machine.scripture import ORIGINAL_VERSIFICATION, Versification
10+
from machine.scripture.parse import parse_selection
11+
12+
standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
13+
"standard_english"
14+
)
15+
standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
16+
"standard_french"
17+
)
818

919

1020
def test_get_quote_convention() -> None:
1121
env = _TestEnvironment(
1222
files={
13-
"41MATTest.SFM": r"""\id MAT
14-
\c 1
15-
\v 1 Someone said, “This is something I am saying!
16-
\v 2 This is also something I am saying” (that is, “something I am speaking”).
17-
\p
18-
\v 3 Other text, and someone else said,
19-
\q1
20-
\v 4 “Things
21-
\q2 someone else said!
22-
\q3 and more things someone else said.”
23-
\m That is why he said “things someone else said.”
24-
\v 5 Then someone said, “More things someone said.”""",
23+
"41MATTest.SFM": rf"""\id MAT
24+
{get_test_chapter(1, standard_english_quote_convention)}""",
2525
}
2626
)
2727
analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()
@@ -30,6 +30,40 @@ def test_get_quote_convention() -> None:
3030
assert analysis.best_quote_convention.name == "standard_english"
3131

3232

33+
def test_get_quote_convention_by_book() -> None:
    """Restricting detection to MRK reports the convention used in MRK.

    MAT is written with standard English marks and MRK with standard French
    marks, so the French result shows that MAT did not drive the outcome.
    """
    matthew = "\n".join([r"\id MAT", get_test_chapter(1, standard_english_quote_convention)])
    mark = "\n".join([r"\id MRK", get_test_chapter(1, standard_french_quote_convention)])
    project_files = {
        "41MATTest.SFM": matthew,
        "42MRKTest.SFM": mark,
    }
    env = _TestEnvironment(files=project_files)

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK")

    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.8
    assert analysis.best_quote_convention.name == "standard_french"
46+
47+
48+
def test_get_quote_convention_by_chapter() -> None:
    """Restricting detection to a chapter selection within MRK.

    MRK chapters 1, 3 and 4 use standard English marks; chapters 2 and 5 use
    standard French marks. The selection string "MRK2,4-5" presumably covers
    chapters 2, 4 and 5 (verify against ``parse_selection``), which would make
    French the majority convention.
    """
    english = standard_english_quote_convention
    french = standard_french_quote_convention

    matthew = "\n".join([r"\id MAT", get_test_chapter(1, english)])
    # Convention used for MRK chapters 1..5, in order.
    mark_conventions = [english, french, english, english, french]
    mark_chapters = [
        get_test_chapter(chapter_number, convention)
        for chapter_number, convention in enumerate(mark_conventions, start=1)
    ]
    mark = "\n".join([r"\id MRK", *mark_chapters])

    env = _TestEnvironment(files={"41MATTest.SFM": matthew, "42MRKTest.SFM": mark})

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5")

    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.66
    assert analysis.best_quote_convention.name == "standard_french"
65+
66+
3367
class _TestEnvironment:
3468
def __init__(
3569
self,
@@ -44,8 +78,27 @@ def __init__(
4478
def detector(self) -> ParatextProjectQuoteConventionDetector:
4579
return self._detector
4680

47-
def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
48-
return self.detector.get_quote_convention_analysis()
81+
def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]:
    """Run the detector, optionally limited to a scripture selection.

    :param scripture_range: selection string handed to ``parse_selection``
        together with ``ORIGINAL_VERSIFICATION``; ``None`` applies no filter.
    :return: whatever the detector's ``get_quote_convention_analysis`` returns.
    """
    if scripture_range is None:
        selection: Optional[Dict[int, List[int]]] = None
    else:
        selection = parse_selection(scripture_range, ORIGINAL_VERSIFICATION)
    return self.detector.get_quote_convention_analysis(include_chapters=selection)
86+
87+
88+
def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str:
    """Build the USFM text of one five-verse test chapter.

    :param number: chapter number written after the ``\\c`` marker.
    :param quote_convention: source of the depth-1 opening and closing
        quotation marks; when falsy, the quotes are left out entirely.
    :return: the chapter as newline-separated USFM lines, with no trailing
        newline.
    """
    opening: str = ""
    closing: str = ""
    if quote_convention:
        opening = quote_convention.get_opening_quotation_mark_at_depth(1)
        closing = quote_convention.get_closing_quotation_mark_at_depth(1)

    usfm_lines = [
        rf"\c {number}",
        rf"\v 1 Someone said, {opening}This is something I am saying!",
        rf"\v 2 This is also something I am saying{closing} (that is, {opening}something I am speaking{closing}).",
        r"\p",
        r"\v 3 Other text, and someone else said,",
        r"\q1",
        rf"\v 4 {opening}Things",
        r"\q2 someone else said!",
        rf"\q3 and more things someone else said.{closing}",
        rf"\m That is why he said {opening}things someone else said.{closing}",
        rf"\v 5 Then someone said, {opening}More things someone said.{closing}",
    ]
    return "\n".join(usfm_lines)
49102

50103

51104
class _DefaultParatextProjectSettings(ParatextProjectSettings):

0 commit comments

Comments
 (0)