Allow passing a selection of books and chapters to the Paratext (PT) quote convention detector

pmachapman · pmachapman · commit 8fdb2730aad1 · 2025-10-02T16:10:13.000+13:00
diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import BinaryIO, Optional, Union
+from typing import BinaryIO, Dict, List, Optional, Union
 
 from ..corpora.paratext_project_settings import ParatextProjectSettings
 from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from ..corpora.usfm_parser import parse_usfm
+from ..scripture.canon import book_id_to_number, get_scripture_books
 from ..utils.typeshed import StrPath
 from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
 
@@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
             self._settings = settings
 
     def get_quote_convention_analysis(
-        self, handler: Optional[QuoteConventionDetector] = None
+        self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
     ) -> Optional[QuoteConventionAnalysis]:
         handler = QuoteConventionDetector() if handler is None else handler
-        for file_name in self._settings.get_all_scripture_book_file_names():
+        for book_id in get_scripture_books():
+            if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
+                continue
+            file_name: str = self._settings.get_book_file_name(book_id)
             if not self._exists(file_name):
                 continue
             with self._open(file_name) as sfm_file:
@@ -33,7 +37,7 @@ def get_quote_convention_analysis(
                     f". Error: '{e}'"
                 )
                 raise RuntimeError(error_message) from e
-        return handler.detect_quote_convention()
+        return handler.detect_quote_convention(include_chapters)
 
     @abstractmethod
     def _exists(self, file_name: StrPath) -> bool: ...
diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from .chapter import Chapter
 from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
@@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter(
 
         self._quotation_mark_tabulator.tabulate(resolved_quotation_marks)
 
-    def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
-        self._count_quotation_marks_in_chapters(self.get_chapters())
+    def detect_quote_convention(
+        self, include_chapters: Optional[Dict[int, List[int]]] = None
+    ) -> Optional[QuoteConventionAnalysis]:
+        self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))
 
         (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
             self._quotation_mark_tabulator
diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py
@@ -7,6 +7,8 @@
 class TextSegment:
     def __init__(self):
         self._text = ""
+        self.book: Optional[str] = None
+        self.chapter: Optional[int] = None
         self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
         self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
         self.previous_segment: Optional[TextSegment] = None
@@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder":
             self._text_segment._markers_in_preceding_context.add(marker)
             return self
 
+        def set_book(self, code: str) -> "TextSegment.Builder":
+            self._text_segment.book = code  # stored unchanged; TextSegment.book defaults to None
+            return self  # return the builder itself so calls can be chained
+
+        def set_chapter(self, number: str) -> "TextSegment.Builder":
+            self._text_segment.chapter = int(number)  # stored as an int; int() raises ValueError if not numeric
+            return self  # return the builder itself so calls can be chained
+
         def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
             self._text_segment._usfm_token = token
             return self
diff --git a/machine/punctuation_analysis/usfm_structure_extractor.py b/machine/punctuation_analysis/usfm_structure_extractor.py
@@ -1,8 +1,9 @@
-from typing import Optional, Sequence
+from typing import Dict, List, Optional, Sequence
 
 from ..corpora.usfm_parser_handler import UsfmParserHandler
 from ..corpora.usfm_parser_state import UsfmParserState
 from ..corpora.usfm_token import UsfmAttribute
+from ..scripture.canon import book_id_to_number
 from .chapter import Chapter
 from .text_segment import TextSegment
 from .usfm_marker_type import UsfmMarkerType
@@ -14,6 +15,9 @@ def __init__(self):
         self._text_segments: list[TextSegment] = []
         self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder()
 
+    def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
+        self._next_text_segment_builder.set_book(code)  # record the book on the pending segment for get_chapters()
+
     def chapter(
         self,
         state: UsfmParserState,
@@ -23,6 +27,7 @@ def chapter(
         pub_number: Optional[str],
     ) -> None:
         self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)
+        self._next_text_segment_builder.set_chapter(number)
 
     def start_para(
         self,
@@ -79,11 +84,26 @@ def text(self, state: UsfmParserState, text: str) -> None:
             self._text_segments.append(text_segment)
         self._next_text_segment_builder = TextSegment.Builder()
 
-    def get_chapters(self) -> list[Chapter]:
+    def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]:
         chapters: list[Chapter] = []
+        current_book: int = 0
+        current_chapter: int = 0
         current_chapter_verses: list[Verse] = []
         current_verse_segments: list[TextSegment] = []
         for text_segment in self._text_segments:
+            if text_segment.book is not None:
+                current_book = book_id_to_number(text_segment.book)
+            if text_segment.chapter is not None:
+                current_chapter = text_segment.chapter
+            if include_chapters is not None and current_book > 0:
+                if current_book not in include_chapters:
+                    continue
+                elif (
+                    current_chapter > 0
+                    and len(include_chapters[current_book]) > 0
+                    and current_chapter not in include_chapters[current_book]
+                ):
+                    continue
             if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE):
                 if len(current_verse_segments) > 0:
                     current_chapter_verses.append(Verse(current_verse_segments))
diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py
@@ -1,27 +1,27 @@
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector
 
 from machine.corpora import ParatextProjectSettings, UsfmStylesheet
 from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis
+from machine.punctuation_analysis.quote_convention import QuoteConvention
+from machine.punctuation_analysis.standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
 from machine.scripture import ORIGINAL_VERSIFICATION, Versification
+from machine.scripture.parse import parse_selection
+
+standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
+    "standard_english"
+)
+standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
+    "standard_french"
+)
 
 
 def test_get_quote_convention() -> None:
     env = _TestEnvironment(
         files={
-            "41MATTest.SFM": r"""\id MAT
-\c 1
-\v 1 Someone said, “This is something I am saying!
-\v 2 This is also something I am saying” (that is, “something I am speaking”).
-\p
-\v 3 Other text, and someone else said,
-\q1
-\v 4 “Things
-\q2 someone else said!
-\q3 and more things someone else said.”
-\m That is why he said “things someone else said.”
-\v 5 Then someone said, “More things someone said.”""",
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
         }
     )
     analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()
@@ -30,6 +30,40 @@ def test_get_quote_convention() -> None:
     assert analysis.best_quote_convention.name == "standard_english"
 
 
+def test_get_quote_convention_by_book() -> None:
+    env = _TestEnvironment(  # MAT is written with English quotes, MRK with French quotes
+        files={
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
+            "42MRKTest.SFM": rf"""\id MRK
+{get_test_chapter(1, standard_french_quote_convention)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK")  # restrict the analysis to MRK
+    assert analysis is not None
+    assert analysis.best_quote_convention_score > 0.8
+    assert analysis.best_quote_convention.name == "standard_french"  # MAT (English) must not count
+
+
+def test_get_quote_convention_by_chapter() -> None:
+    env = _TestEnvironment(  # MRK chapters 2 and 5 are French; MRK 1, 3, 4 and all of MAT are English
+        files={
+            "41MATTest.SFM": rf"""\id MAT
+{get_test_chapter(1, standard_english_quote_convention)}""",
+            "42MRKTest.SFM": rf"""\id MRK
+{get_test_chapter(1, standard_english_quote_convention)}
+{get_test_chapter(2, standard_french_quote_convention)}
+{get_test_chapter(3, standard_english_quote_convention)}
+{get_test_chapter(4, standard_english_quote_convention)}
+{get_test_chapter(5, standard_french_quote_convention)}""",
+        }
+    )
+    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5")  # MRK chapters 2, 4, 5
+    assert analysis is not None
+    assert analysis.best_quote_convention_score > 0.66  # two of the three selected chapters are French
+    assert analysis.best_quote_convention.name == "standard_french"
+
+
 class _TestEnvironment:
     def __init__(
         self,
@@ -44,8 +78,27 @@ def __init__(
     def detector(self) -> ParatextProjectQuoteConventionDetector:
         return self._detector
 
-    def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
-        return self.detector.get_quote_convention_analysis()
+    def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]:
+        chapters: Optional[Dict[int, List[int]]] = None  # None means no book/chapter filter is applied
+        if scripture_range is not None:
+            chapters = parse_selection(scripture_range, ORIGINAL_VERSIFICATION)  # e.g. "MRK" or "MRK2,4-5"
+        return self.detector.get_quote_convention_analysis(include_chapters=chapters)
+
+
+def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str:
+    left_quote: str = quote_convention.get_opening_quotation_mark_at_depth(1) if quote_convention else ""
+    right_quote: str = quote_convention.get_closing_quotation_mark_at_depth(1) if quote_convention else ""
+    return rf"""\c {number}
+\v 1 Someone said, {left_quote}This is something I am saying!
+\v 2 This is also something I am saying{right_quote} (that is, {left_quote}something I am speaking{right_quote}).
+\p
+\v 3 Other text, and someone else said,
+\q1
+\v 4 {left_quote}Things
+\q2 someone else said!
+\q3 and more things someone else said.{right_quote}
+\m That is why he said {left_quote}things someone else said.{right_quote}
+\v 5 Then someone said, {left_quote}More things someone said.{right_quote}"""
 
 
 class _DefaultParatextProjectSettings(ParatextProjectSettings):