Commit 0c3cd9c

benjaminking and Ben King authored
Improved quote convention detection for Paratext projects (#239)
* Use a weighted average of books for Paratext project quote convention detection
* Always return a QuoteConventionAnalysis instead of None
* Modify quote convention similarity calculation
* Add new quote convention
* Minor code clarity changes
* Fix linting issue
* Address reviewer comments + refactor weighted average

---------

Co-authored-by: Ben King <[email protected]>
1 parent 9d71953 commit 0c3cd9c

14 files changed: +258 −88 lines changed
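
As a quick orientation before the per-file diffs, the manual test updated in this commit exercises the new API roughly as follows. This is a minimal sketch, not part of the commit: the archive path is a placeholder, and any Paratext project zip would do.

import zipfile

from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector

# Sketch of the new call pattern: no QuoteConventionDetector handler is passed in anymore,
# and the result is always a QuoteConventionAnalysis (never None).
archive = zipfile.ZipFile("my_paratext_project.zip", "r")  # placeholder path
detector = ZipParatextProjectQuoteConventionDetector(archive)
analysis = detector.get_quote_convention_analysis()

if analysis.best_quote_convention is not None:
    print(analysis.best_quote_convention.name, analysis.best_quote_convention_score)
    print(analysis.analysis_summary)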

machine/punctuation_analysis/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -32,9 +32,10 @@
 from .quotation_mark_update_settings import QuotationMarkUpdateSettings
 from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
 from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis
 from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
 from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
-from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
+from .quote_convention_detector import QuoteConventionDetector
 from .quote_convention_set import QuoteConventionSet
 from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
 from .text_segment import TextSegment

machine/punctuation_analysis/paratext_project_quote_convention_detector.py

Lines changed: 15 additions & 5 deletions
@@ -6,7 +6,8 @@
 from ..corpora.usfm_parser import parse_usfm
 from ..scripture.canon import book_id_to_number, get_scripture_books
 from ..utils.typeshed import StrPath
-from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
+from .quote_convention_analysis import QuoteConventionAnalysis
+from .quote_convention_detector import QuoteConventionDetector


 class ParatextProjectQuoteConventionDetector(ABC):
@@ -17,15 +18,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
         self._settings = settings

     def get_quote_convention_analysis(
-        self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
-    ) -> Optional[QuoteConventionAnalysis]:
-        handler = QuoteConventionDetector() if handler is None else handler
+        self, include_chapters: Optional[Dict[int, List[int]]] = None
+    ) -> QuoteConventionAnalysis:
+
+        book_quote_convention_analyses: List[QuoteConventionAnalysis] = []
+
         for book_id in get_scripture_books():
             if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
                 continue
             file_name: str = self._settings.get_book_file_name(book_id)
             if not self._exists(file_name):
                 continue
+
+            handler = QuoteConventionDetector()
+
             with self._open(file_name) as sfm_file:
                 usfm: str = sfm_file.read().decode(self._settings.encoding)
                 try:
@@ -37,7 +43,11 @@ def get_quote_convention_analysis(
                         f". Error: '{e}'"
                     )
                     raise RuntimeError(error_message) from e
-        return handler.detect_quote_convention(include_chapters)
+
+            quote_convention_analysis = handler.detect_quote_convention(include_chapters)
+            book_quote_convention_analyses.append(quote_convention_analysis)
+
+        return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses)

     @abstractmethod
     def _exists(self, file_name: StrPath) -> bool: ...

machine/punctuation_analysis/quotation_mark_tabulator.py

Lines changed: 39 additions & 12 deletions
@@ -1,5 +1,5 @@
 from collections import Counter, defaultdict
-from typing import List
+from typing import Dict, List

 from .quotation_mark_direction import QuotationMarkDirection
 from .quotation_mark_metadata import QuotationMarkMetadata
@@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None:
         self._quotation_mark_counter.update([quotation_mark])
         self._total_count += 1

+    def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None:
+        self._quotation_mark_counter.update(quotation_mark_counts._quotation_mark_counter)
+        self._total_count += quotation_mark_counts._total_count
+
     def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]:
         return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,)

@@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None:
         for quotation_mark in quotation_marks:
             self._count_quotation_mark(quotation_mark)

+    def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None:
+        for (
+            depth_and_direction,
+            quotation_mark_counts,
+        ) in tabulated_quotation_marks._quotation_counts_by_depth_and_direction.items():
+            self._quotation_counts_by_depth_and_direction[depth_and_direction].count_from(quotation_mark_counts)
+
     def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None:
         key = (quotation_mark.depth, quotation_mark.direction)
         self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark)
@@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction(
     ) -> tuple[str, int, int]:
         return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion()

+    def get_total_quotation_mark_count(self) -> int:
+        total_count = 0
+        for counts in self._quotation_counts_by_depth_and_direction.values():
+            total_count += counts.get_observed_count()
+        return total_count
+
     def calculate_similarity(self, quote_convention: QuoteConvention) -> float:
-        weighted_difference = 0
-        total_weight = 0
-        for depth, direction in self._quotation_counts_by_depth_and_direction:
+        num_marks_by_depth: Dict[int, int] = defaultdict(int)
+        num_matching_marks_by_depth: Dict[int, int] = defaultdict(int)
+
+        for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]):
             expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)

-            # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
-            weighted_difference += self._quotation_counts_by_depth_and_direction[
+            num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count()
+            num_marks_by_depth[depth] += num_matching_marks
+            num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[
                 (depth, direction)
-            ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth)
-            total_weight += self._quotation_counts_by_depth_and_direction[
-                (depth, direction)
-            ].get_observed_count() * 2 ** (-depth)
+            ].calculate_num_differences(expected_quotation_mark)
+
+        # The scores of greater depths depend on the scores of shallower depths
+        scores_by_depth: Dict[int, float] = defaultdict(float)
+        for depth in sorted(num_marks_by_depth.keys()):
+            previous_depth_score = (
+                scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1
+            )
+            scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]
+
+        total_marks = sum(num_marks_by_depth.values())
+        total_score = sum(scores_by_depth.values())

-        if total_weight == 0:
+        if total_marks == 0:
             return 0
-        return 1 - (weighted_difference / total_weight)
+        return total_score / total_marks

     def get_summary_message(self) -> str:
         message_lines: List[str] = []
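
A worked illustration of the new similarity calculation (all numbers below are invented, not from the commit): matches at deeper quote levels only count in proportion to how accurate the shallower levels were, replacing the old 2 ** (-depth) weighting. The sketch restates the calculation for consecutive depths:

# Hypothetical tallies: depth -> observed marks and matching marks
num_marks_by_depth = {1: 100, 2: 20}
num_matching_marks_by_depth = {1: 95, 2: 10}

scores_by_depth = {}
previous_depth_score = 1.0
for depth in sorted(num_marks_by_depth):
    # A depth's score is its matching-mark count scaled by the previous depth's accuracy
    scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]
    previous_depth_score = scores_by_depth[depth] / num_marks_by_depth[depth]

similarity = sum(scores_by_depth.values()) / sum(num_marks_by_depth.values())
# scores_by_depth == {1: 95.0, 2: 9.5}; similarity == 104.5 / 120 ≈ 0.87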

machine/punctuation_analysis/quote_convention.py

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention":
         )
         return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark)

+    def __hash__(self) -> int:
+        return hash((self.opening_quotation_mark, self.closing_quotation_mark))
+

 class QuoteConvention:
     def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]):
@@ -57,6 +60,9 @@ def __eq__(self, value):
             return False
         return True

+    def __hash__(self) -> int:
+        return hash(tuple(self.level_conventions))
+
     @property
     def name(self) -> str:
         return self._name
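
The new __hash__ methods exist so that quote conventions can serve as dictionary keys, which the convention_scores dict in the new QuoteConventionAnalysis class relies on. A minimal illustration with a made-up convention:

from machine.punctuation_analysis import QuoteConvention, SingleLevelQuoteConvention

# Hypothetical convention, shown only to demonstrate that conventions are now hashable
convention = QuoteConvention(
    "example_convention",
    [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")],
)
convention_scores = {convention: 0.9}  # valid dict key thanks to __hash__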

machine/punctuation_analysis/quote_convention_analysis.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+from collections import defaultdict
+from typing import Dict, List, Optional
+
+from .quotation_mark_tabulator import QuotationMarkTabulator
+from .quote_convention import QuoteConvention
+
+
+class QuoteConventionAnalysis:
+
+    def __init__(
+        self,
+        convention_scores: dict[QuoteConvention, float],
+        tabulated_quotation_marks: QuotationMarkTabulator,
+        analysis_weight: float = 1.0,  # weight is used for combining scores for multiple books
+    ):
+        self._convention_scores = convention_scores
+        if len(convention_scores) > 0:
+            (self._best_quote_convention, self._best_quote_convention_score) = max(
+                convention_scores.items(), key=lambda item: item[1]
+            )
+        else:
+            self._best_quote_convention_score = 0
+            self._best_quote_convention = None
+
+        self._tabulated_quotation_marks = tabulated_quotation_marks
+        self._analysis_weight = analysis_weight
+
+    @property
+    def analysis_summary(self) -> str:
+        return self._tabulated_quotation_marks.get_summary_message()
+
+    @property
+    def best_quote_convention(self) -> Optional[QuoteConvention]:
+        return self._best_quote_convention
+
+    @property
+    def best_quote_convention_score(self) -> float:
+        return self._best_quote_convention_score
+
+    class Builder:
+        def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator):
+            self._convention_scores: dict[QuoteConvention, float] = {}
+            self._tabulated_quotation_marks = tabulated_quotation_marks
+
+        def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None:
+            self._convention_scores[quote_convention] = score
+
+        def build(self) -> "QuoteConventionAnalysis":
+            return QuoteConventionAnalysis(
+                self._convention_scores,
+                self._tabulated_quotation_marks,
+                self._tabulated_quotation_marks.get_total_quotation_mark_count(),
+            )
+
+    @staticmethod
+    def combine_with_weighted_average(
+        quote_convention_analyses: List["QuoteConventionAnalysis"],
+    ) -> "QuoteConventionAnalysis":
+        total_weight: float = 0
+        convention_votes: Dict[str, float] = defaultdict(float)
+        quote_conventions_by_name: Dict[str, QuoteConvention] = {}
+        total_tabulated_quotation_marks = QuotationMarkTabulator()
+        for quote_convention_analysis in quote_convention_analyses:
+            total_tabulated_quotation_marks.tabulate_from(quote_convention_analysis._tabulated_quotation_marks)
+            total_weight += quote_convention_analysis._analysis_weight
+            for convention, score in quote_convention_analysis._convention_scores.items():
+                if convention.name not in quote_conventions_by_name:
+                    quote_conventions_by_name[convention.name] = convention
+                convention_votes[convention.name] += score * quote_convention_analysis._analysis_weight
+
+        quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(total_tabulated_quotation_marks)
+
+        for convention_name, total_score in convention_votes.items():
+            if total_score > 0:
+                quote_convention_analysis_builder.record_convention_score(
+                    quote_conventions_by_name[convention_name], total_score / total_weight
+                )
+
+        return quote_convention_analysis_builder.build()
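
How the weighted average behaves in practice (book counts and scores below are invented): each per-book analysis carries its total quotation-mark count as analysis_weight (set by the Builder), so books with many observed marks dominate the combined score for each convention.

# Hypothetical per-book results for one convention: (analysis_weight, similarity score)
book_results = [(500, 0.95), (50, 0.60)]  # a long book and a short, noisier one

total_weight = sum(weight for weight, _ in book_results)
combined_score = sum(weight * score for weight, score in book_results) / total_weight
# combined_score == (500 * 0.95 + 50 * 0.60) / 550 ≈ 0.918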

machine/punctuation_analysis/quote_convention_detector.py

Lines changed: 3 additions & 19 deletions
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 from typing import Dict, List, Optional

 from .chapter import Chapter
@@ -8,20 +7,13 @@
 from .quotation_mark_metadata import QuotationMarkMetadata
 from .quotation_mark_string_match import QuotationMarkStringMatch
 from .quotation_mark_tabulator import QuotationMarkTabulator
-from .quote_convention import QuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis
 from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
 from .quote_convention_set import QuoteConventionSet
 from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
 from .usfm_structure_extractor import UsfmStructureExtractor


-@dataclass(frozen=True)
-class QuoteConventionAnalysis:
-    best_quote_convention: QuoteConvention
-    best_quote_convention_score: float
-    analysis_summary: str
-
-
 class QuoteConventionDetector(UsfmStructureExtractor):

     def __init__(self):
@@ -53,15 +45,7 @@ def _count_quotation_marks_in_chapter(

     def detect_quote_convention(
         self, include_chapters: Optional[Dict[int, List[int]]] = None
-    ) -> Optional[QuoteConventionAnalysis]:
+    ) -> QuoteConventionAnalysis:
         self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

-        (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
-            self._quotation_mark_tabulator
-        )
-
-        if score > 0 and best_quote_convention is not None:
-            return QuoteConventionAnalysis(
-                best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message()
-            )
-        return None
+        return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)
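
For callers of detect_quote_convention, the practical change is that the result is never None anymore; the null check moves to best_quote_convention, as the updated manual test later in this diff shows. A before/after sketch (detector stands for any QuoteConventionDetector instance; it is not code from the commit):

# Before this commit: the analysis itself could be None
analysis = detector.detect_quote_convention()
if analysis is not None:
    print(analysis.best_quote_convention.name)

# After this commit: an analysis is always returned; check its best convention instead
analysis = detector.detect_quote_convention()
if analysis.best_quote_convention is not None:
    print(analysis.best_quote_convention.name)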

machine/punctuation_analysis/quote_convention_set.py

Lines changed: 9 additions & 0 deletions
@@ -7,6 +7,7 @@
 from .quotation_mark_direction import QuotationMarkDirection
 from .quotation_mark_tabulator import QuotationMarkTabulator
 from .quote_convention import QuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis


 class QuoteConventionSet:
@@ -149,3 +150,11 @@ def find_most_similar_convention(
                 best_quote_convention = quote_convention

         return (best_quote_convention, best_similarity)
+
+    def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis:
+        quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks)
+        for quote_convention in self._conventions:
+            score = tabulated_quotation_marks.calculate_similarity(quote_convention)
+            quote_convention_analysis_builder.record_convention_score(quote_convention, score)
+
+        return quote_convention_analysis_builder.build()

machine/punctuation_analysis/standard_quote_conventions.py

Lines changed: 9 additions & 0 deletions
@@ -187,6 +187,15 @@
             [
                 SingleLevelQuoteConvention("\u00ab", "\u00bb"),
                 SingleLevelQuoteConvention("\u2019", "\u2018"),
+                SingleLevelQuoteConvention("\u201d", "\u201c"),
+            ],
+        ),
+        QuoteConvention(
+            "arabic_inspired_western_european",
+            [
+                SingleLevelQuoteConvention("\u00ab", "\u00bb"),
+                SingleLevelQuoteConvention("\u201d", "\u201c"),
+                SingleLevelQuoteConvention("\u2019", "\u2018"),
             ],
         ),
     ]
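
Rendered, the escapes in the new arabic_inspired_western_european convention are guillemets at the first level and reversed curly quotes at the second and third levels; a quick way to see them:

# Prints the three levels of the new convention: «...», ”...“, ’...‘
for opening, closing in [("\u00ab", "\u00bb"), ("\u201d", "\u201c"), ("\u2019", "\u2018")]:
    print(f"{opening}...{closing}")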

machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py

Lines changed: 2 additions & 0 deletions
@@ -2,12 +2,14 @@
 from typing import BinaryIO, Optional
 from zipfile import ZipFile

+from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
 from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


 class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
     def __init__(self, archive: ZipFile) -> None:
         self._archive = archive
+        super().__init__(ZipParatextProjectSettingsParser(archive))

     def _exists(self, file_name: str) -> bool:
         return file_name in self._archive.namelist()

tests/corpora/test_usfm_manual.py

Lines changed: 5 additions & 10 deletions
@@ -24,7 +24,7 @@
     ZipParatextProjectSettingsParser,
     ZipParatextProjectTextUpdater,
 )
-from machine.punctuation_analysis import QuoteConventionDetector, ZipParatextProjectQuoteConventionDetector
+from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector


 @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
@@ -135,18 +135,13 @@ def get_usfm(project_path: Path):

 @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
 def test_analyze_corpora_quote_conventions():
-    source_handler = QuoteConventionDetector()
     source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r")
     source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
-    source_quote_convention_detector.get_quote_convention_analysis(source_handler)
+    source_analysis = source_quote_convention_detector.get_quote_convention_analysis()

-    target_handler = QuoteConventionDetector()
     target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r")
     target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
-    target_quote_convention_detector.get_quote_convention_analysis(target_handler)
+    target_analysis = target_quote_convention_detector.get_quote_convention_analysis()

-    source_analysis = source_handler.detect_quote_convention()
-    target_analysis = target_handler.detect_quote_convention()
-
-    assert source_analysis is not None
-    assert target_analysis is not None
+    assert source_analysis.best_quote_convention is not None
+    assert target_analysis.best_quote_convention is not None
