Allow passing a selection of books and chapters to the PT quote detector (#236)

pmachapman · web-flow · commit 11d3611e2c72 · 2025-10-07T10:25:43.000+13:00
* Allow passing a selection of books and chapters to the PT quote detector

* Fix get_scripture_books to also return deuterocanonical (DC) books
diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import BinaryIO, Optional, Union
+from typing import BinaryIO, Dict, List, Optional, Union
 
 from ..corpora.paratext_project_settings import ParatextProjectSettings
 from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from ..corpora.usfm_parser import parse_usfm
+from ..scripture.canon import book_id_to_number, get_scripture_books
 from ..utils.typeshed import StrPath
 from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
 
@@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
             self._settings = settings
 
     def get_quote_convention_analysis(
-        self, handler: Optional[QuoteConventionDetector] = None
+        self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
     ) -> Optional[QuoteConventionAnalysis]:
         handler = QuoteConventionDetector() if handler is None else handler
-        for file_name in self._settings.get_all_scripture_book_file_names():
+        for book_id in get_scripture_books():
+            if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
+                continue
+            file_name: str = self._settings.get_book_file_name(book_id)
             if not self._exists(file_name):
                 continue
             with self._open(file_name) as sfm_file:
@@ -33,7 +37,7 @@ def get_quote_convention_analysis(
                     f". Error: '{e}'"
                 )
                 raise RuntimeError(error_message) from e
-        return handler.detect_quote_convention()
+        return handler.detect_quote_convention(include_chapters)
 
     @abstractmethod
     def _exists(self, file_name: StrPath) -> bool: ...
diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from .chapter import Chapter
 from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
@@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter(
 
         self._quotation_mark_tabulator.tabulate(resolved_quotation_marks)
 
-    def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
-        self._count_quotation_marks_in_chapters(self.get_chapters())
+    def detect_quote_convention(
+        self, include_chapters: Optional[Dict[int, List[int]]] = None
+    ) -> Optional[QuoteConventionAnalysis]:
+        self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))
 
         (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
             self._quotation_mark_tabulator
diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py
@@ -7,6 +7,8 @@
 class TextSegment:
     def __init__(self):
         self._text = ""
+        self.book: Optional[str] = None
+        self.chapter: Optional[int] = None
         self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
         self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
         self.previous_segment: Optional[TextSegment] = None
@@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder":
             self._text_segment._markers_in_preceding_context.add(marker)
             return self
 
+        def set_book(self, code: str) -> "TextSegment.Builder":
+            self._text_segment.book = code
+            return self
+
+        def set_chapter(self, number: int) -> "TextSegment.Builder":
+            self._text_segment.chapter = number
+            return self
+
         def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
             self._text_segment._usfm_token = token
             return self
diff --git a/machine/punctuation_analysis/usfm_structure_extractor.py b/machine/punctuation_analysis/usfm_structure_extractor.py
@@ -1,8 +1,9 @@
-from typing import Optional, Sequence
+from typing import Dict, List, Optional, Sequence
 
 from ..corpora.usfm_parser_handler import UsfmParserHandler
 from ..corpora.usfm_parser_state import UsfmParserState
 from ..corpora.usfm_token import UsfmAttribute
+from ..scripture.canon import book_id_to_number
 from .chapter import Chapter
 from .text_segment import TextSegment
 from .usfm_marker_type import UsfmMarkerType
@@ -14,6 +15,9 @@ def __init__(self):
         self._text_segments: list[TextSegment] = []
         self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder()
 
+    def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
+        self._next_text_segment_builder.set_book(code)
+
     def chapter(
         self,
         state: UsfmParserState,
@@ -23,6 +27,8 @@ def chapter(
         pub_number: Optional[str],
     ) -> None:
         self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)
+        chapter_number: int = int(number) if number.isdigit() else 0
+        self._next_text_segment_builder.set_chapter(chapter_number)
 
     def start_para(
         self,
@@ -79,11 +85,26 @@ def text(self, state: UsfmParserState, text: str) -> None:
             self._text_segments.append(text_segment)
         self._next_text_segment_builder = TextSegment.Builder()
 
-    def get_chapters(self) -> list[Chapter]:
+    def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]:
         chapters: list[Chapter] = []
+        current_book: int = 0
+        current_chapter: int = 0
         current_chapter_verses: list[Verse] = []
         current_verse_segments: list[TextSegment] = []
         for text_segment in self._text_segments:
+            if text_segment.book is not None:
+                current_book = book_id_to_number(text_segment.book)
+            if text_segment.chapter is not None:
+                current_chapter = text_segment.chapter
+            if include_chapters is not None and current_book > 0:
+                if current_book not in include_chapters:
+                    continue
+                elif (
+                    current_chapter > 0
+                    and len(include_chapters[current_book]) > 0
+                    and current_chapter not in include_chapters[current_book]
+                ):
+                    continue
             if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE):
                 if len(current_verse_segments) > 0:
                     current_chapter_verses.append(Verse(current_verse_segments))
diff --git a/machine/scripture/canon.py b/machine/scripture/canon.py
@@ -184,4 +184,4 @@ def is_canonical(book: Union[str, int]) -> bool:
 
 
 def get_scripture_books() -> Iterable[str]:
-    return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_ot_nt(kvp[1]), BOOK_NUMBERS.items())))
+    return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_canonical(kvp[1]), BOOK_NUMBERS.items())))
diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py
@@ -1,27 +1,29 @@
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector
 
 from machine.corpora import ParatextProjectSettings, UsfmStylesheet
-from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis
-from machine.scripture import ORIGINAL_VERSIFICATION, Versification
+from machine.punctuation_analysis import (
+    STANDARD_QUOTE_CONVENTIONS,
+    ParatextProjectQuoteConventionDetector,
+    QuoteConvention,
+    QuoteConventionAnalysis,
+)
+from machine.scripture import ORIGINAL_VERSIFICATION, Versification, get_chapters
+
+standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
+    "standard_english"
+)
+standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
+    "standard_french"
+)
 
 
 def test_get_quote_convention() -> None:
     env = _TestEnvironment(
         files={
-            "41MATTest.SFM": r"""\id MAT
-\c 1
-\v 1 Someone said, “This is something I am saying!
-\v 2 This is also something I am saying” (that is, “something I am speaking”).
-\p
-\v 3 Other text, and someone else said,
-\q1
-\v 4 “Things
-\q2 someone else said!
-\q3 and more things someone else said.”
-\m That is why he said “things someone else said.”
-\v 5 Then someone said, “More things someone said.”""",
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
         }
     )
     analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()
@@ -30,6 +32,64 @@ def test_get_quote_convention() -> None:
     assert analysis.best_quote_convention.name == "standard_english"
 
 
+def test_get_quote_convention_by_book() -> None:
+    env = _TestEnvironment(
+        files={
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
+            "42MRKTest.SFM": rf"""\id MRK
+{get_test_chapter(1, standard_french_quote_convention)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK")
+    assert analysis is not None
+    assert analysis.best_quote_convention_score > 0.8
+    assert analysis.best_quote_convention.name == "standard_french"
+
+
+def test_get_quote_convention_by_chapter() -> None:
+    env = _TestEnvironment(
+        files={
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
+            "42MRKTest.SFM": rf"""\id MRK
+{get_test_chapter(1, standard_english_quote_convention)}
+{get_test_chapter(2, standard_french_quote_convention)}
+{get_test_chapter(3, standard_english_quote_convention)}
+{get_test_chapter(4, standard_english_quote_convention)}
+{get_test_chapter(5, standard_french_quote_convention)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5")
+    assert analysis is not None
+    assert analysis.best_quote_convention_score > 0.66
+    assert analysis.best_quote_convention.name == "standard_french"
+
+
+def test_get_quote_convention_by_chapter_indeterminate() -> None:
+    env = _TestEnvironment(
+        files={
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, None)}
+{get_test_chapter(2, standard_english_quote_convention)}
+{get_test_chapter(3, None)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3")
+    assert analysis is None
+
+
+def test_get_quote_convention_invalid_book_code() -> None:
+    env = _TestEnvironment(
+        files={
+            "41MATTest.SFM": rf"""\id LUK
+{get_test_chapter(1, standard_english_quote_convention)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT")
+    assert analysis is None
+
+
 class _TestEnvironment:
     def __init__(
         self,
@@ -44,8 +104,27 @@ def __init__(
     def detector(self) -> ParatextProjectQuoteConventionDetector:
         return self._detector
 
-    def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
-        return self.detector.get_quote_convention_analysis()
+    def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]:
+        chapters: Optional[Dict[int, List[int]]] = None
+        if scripture_range is not None:
+            chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION)
+        return self.detector.get_quote_convention_analysis(include_chapters=chapters)
+
+
+def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str:
+    left_quote: str = quote_convention.get_opening_quotation_mark_at_depth(1) if quote_convention else ""
+    right_quote: str = quote_convention.get_closing_quotation_mark_at_depth(1) if quote_convention else ""
+    return rf"""\c {number}
+\v 1 Someone said, {left_quote}This is something I am saying!
+\v 2 This is also something I am saying{right_quote} (that is, {left_quote}something I am speaking{right_quote}).
+\p
+\v 3 Other text, and someone else said,
+\q1
+\v 4 {left_quote}Things
+\q2 someone else said!
+\q3 and more things someone else said.{right_quote}
+\m That is why he said {left_quote}things someone else said.{right_quote}
+\v 5 Then someone said, {left_quote}More things someone said.{right_quote}"""
 
 
 class _DefaultParatextProjectSettings(ParatextProjectSettings):
diff --git a/tests/punctuation_analysis/test_usfm_structure_extractor.py b/tests/punctuation_analysis/test_usfm_structure_extractor.py
@@ -7,6 +7,52 @@
 verse_text_parser_state.verse_ref.verse_num = 1
 
 
+def test_get_chapters_filter_by_book():
+    usfm_structure_extractor = UsfmStructureExtractor()
+    usfm_structure_extractor.start_book(verse_text_parser_state, "id", "GEN")
+    usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)
+    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
+    usfm_structure_extractor.text(verse_text_parser_state, "test")
+
+    actual_chapters = usfm_structure_extractor.get_chapters({2: [1]})  # EXO 1
+    assert len(actual_chapters) == 0
+
+
+def test_get_chapters_filter_by_chapter():
+    usfm_structure_extractor = UsfmStructureExtractor()
+    usfm_structure_extractor.start_book(verse_text_parser_state, "id", "MAT")
+    usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)
+    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
+    usfm_structure_extractor.text(verse_text_parser_state, "test")
+    usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None)
+    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
+    usfm_structure_extractor.text(verse_text_parser_state, "test2")
+    usfm_structure_extractor.chapter(verse_text_parser_state, "3", "c", None, None)
+    usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None)
+    usfm_structure_extractor.text(verse_text_parser_state, "test3")
+
+    expected_chapters = [
+        Chapter(
+            [
+                Verse(
+                    [
+                        TextSegment.Builder()
+                        .set_text("test2")
+                        .add_preceding_marker(UsfmMarkerType.CHAPTER)
+                        .add_preceding_marker(UsfmMarkerType.VERSE)
+                        .build()
+                    ]
+                )
+            ]
+        )
+    ]
+
+    actual_chapters = usfm_structure_extractor.get_chapters({40: [2]})
+    assert_chapter_equal(expected_chapters, actual_chapters)
+    assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None
+    assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None
+
+
 def test_chapter_and_verse_markers():
     usfm_structure_extractor = UsfmStructureExtractor()
     usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -184,4 +184,4 @@ def is_canonical(book: Union[str, int]) -> bool:` |
| `184` | `184` | |
| `185` | `185` | |
| `186` | `186` | `def get_scripture_books() -> Iterable[str]:` |
| `187` | | `-    return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_ot_nt(kvp[1]), BOOK_NUMBERS.items())))` |
| | `187` | `+    return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_canonical(kvp[1]), BOOK_NUMBERS.items())))` |