Commit 0c3cd9c

benjaminking and Ben King authored
Improved quote convention detection for Paratext projects (#239)
* Use a weighted average of books for Paratext project quote convention detection
* Always return a QuoteConventionAnalysis instead of None
* Modify quote convention similarity calculation
* Add new quote convention
* Minor code clarity changes
* Fix linting issue
* Address reviewer comments + refactor weighted average

---------

Co-authored-by: Ben King <[email protected]>
1 parent 9d71953 commit 0c3cd9c

14 files changed: +258 −88 lines changed
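
As a quick orientation before the per-file diffs, the manual test updated in this commit exercises the new API roughly as follows. This is a minimal sketch, not part of the commit: the archive path is a placeholder, and any Paratext project zip would do.

import zipfile

from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector

# Sketch of the new call pattern: no QuoteConventionDetector handler is passed in anymore,
# and the result is always a QuoteConventionAnalysis (never None).
archive = zipfile.ZipFile("my_paratext_project.zip", "r")  # placeholder path
detector = ZipParatextProjectQuoteConventionDetector(archive)
analysis = detector.get_quote_convention_analysis()

if analysis.best_quote_convention is not None:
    print(analysis.best_quote_convention.name, analysis.best_quote_convention_score)
    print(analysis.analysis_summary)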

machine/punctuation_analysis/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -32,9 +32,10 @@
 from .quotation_mark_update_settings import QuotationMarkUpdateSettings
 from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
 from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis
 from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
 from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
-from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
+from .quote_convention_detector import QuoteConventionDetector
 from .quote_convention_set import QuoteConventionSet
 from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
 from .text_segment import TextSegment

machine/punctuation_analysis/paratext_project_quote_convention_detector.py

Lines changed: 15 additions & 5 deletions
@@ -6,7 +6,8 @@
 from ..corpora.usfm_parser import parse_usfm
 from ..scripture.canon import book_id_to_number, get_scripture_books
 from ..utils.typeshed import StrPath
-from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
+from .quote_convention_analysis import QuoteConventionAnalysis
+from .quote_convention_detector import QuoteConventionDetector


 class ParatextProjectQuoteConventionDetector(ABC):
@@ -17,15 +18,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
         self._settings = settings

     def get_quote_convention_analysis(
-        self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
-    ) -> Optional[QuoteConventionAnalysis]:
-        handler = QuoteConventionDetector() if handler is None else handler
+        self, include_chapters: Optional[Dict[int, List[int]]] = None
+    ) -> QuoteConventionAnalysis:
+
+        book_quote_convention_analyses: List[QuoteConventionAnalysis] = []
+
         for book_id in get_scripture_books():
             if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
                 continue
             file_name: str = self._settings.get_book_file_name(book_id)
             if not self._exists(file_name):
                 continue
+
+            handler = QuoteConventionDetector()
+
             with self._open(file_name) as sfm_file:
                 usfm: str = sfm_file.read().decode(self._settings.encoding)
                 try:
@@ -37,7 +43,11 @@ def get_quote_convention_analysis(
                         f". Error: '{e}'"
                     )
                     raise RuntimeError(error_message) from e
-        return handler.detect_quote_convention(include_chapters)
+
+            quote_convention_analysis = handler.detect_quote_convention(include_chapters)
+            book_quote_convention_analyses.append(quote_convention_analysis)
+
+        return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses)

     @abstractmethod
     def _exists(self, file_name: StrPath) -> bool: ...

machine/punctuation_analysis/quotation_mark_tabulator.py

Lines changed: 39 additions & 12 deletions
@@ -1,5 +1,5 @@
 from collections import Counter, defaultdict
-from typing import List
+from typing import Dict, List

 from .quotation_mark_direction import QuotationMarkDirection
 from .quotation_mark_metadata import QuotationMarkMetadata
@@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None:
         self._quotation_mark_counter.update([quotation_mark])
         self._total_count += 1

+    def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None:
+        self._quotation_mark_counter.update(quotation_mark_counts._quotation_mark_counter)
+        self._total_count += quotation_mark_counts._total_count
+
     def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]:
         return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,)

@@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None:
         for quotation_mark in quotation_marks:
             self._count_quotation_mark(quotation_mark)

+    def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None:
+        for (
+            depth_and_direction,
+            quotation_mark_counts,
+        ) in tabulated_quotation_marks._quotation_counts_by_depth_and_direction.items():
+            self._quotation_counts_by_depth_and_direction[depth_and_direction].count_from(quotation_mark_counts)
+
     def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None:
         key = (quotation_mark.depth, quotation_mark.direction)
         self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark)
@@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction(
     ) -> tuple[str, int, int]:
         return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion()

+    def get_total_quotation_mark_count(self) -> int:
+        total_count = 0
+        for counts in self._quotation_counts_by_depth_and_direction.values():
+            total_count += counts.get_observed_count()
+        return total_count
+
     def calculate_similarity(self, quote_convention: QuoteConvention) -> float:
-        weighted_difference = 0
-        total_weight = 0
-        for depth, direction in self._quotation_counts_by_depth_and_direction:
+        num_marks_by_depth: Dict[int, int] = defaultdict(int)
+        num_matching_marks_by_depth: Dict[int, int] = defaultdict(int)
+
+        for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]):
             expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)

-            # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
-            weighted_difference += self._quotation_counts_by_depth_and_direction[
+            num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count()
+            num_marks_by_depth[depth] += num_matching_marks
+            num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[
                 (depth, direction)
-            ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth)
-            total_weight += self._quotation_counts_by_depth_and_direction[
-                (depth, direction)
-            ].get_observed_count() * 2 ** (-depth)
+            ].calculate_num_differences(expected_quotation_mark)
+
+        # The scores of greater depths depend on the scores of shallower depths
+        scores_by_depth: Dict[int, float] = defaultdict(float)
+        for depth in sorted(num_marks_by_depth.keys()):
+            previous_depth_score = (
+                scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1
+            )
+            scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]
+
+        total_marks = sum(num_marks_by_depth.values())
+        total_score = sum(scores_by_depth.values())

-        if total_weight == 0:
+        if total_marks == 0:
             return 0
-        return 1 - (weighted_difference / total_weight)
+        return total_score / total_marks

     def get_summary_message(self) -> str:
         message_lines: List[str] = []
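
A worked illustration of the new similarity calculation (all numbers below are invented, not from the commit): matches at deeper quote levels only count in proportion to how accurate the shallower levels were, replacing the old 2 ** (-depth) weighting. The sketch restates the calculation for consecutive depths:

# Hypothetical tallies: depth -> observed marks and matching marks
num_marks_by_depth = {1: 100, 2: 20}
num_matching_marks_by_depth = {1: 95, 2: 10}

scores_by_depth = {}
previous_depth_score = 1.0
for depth in sorted(num_marks_by_depth):
    # A depth's score is its matching-mark count scaled by the previous depth's accuracy
    scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]
    previous_depth_score = scores_by_depth[depth] / num_marks_by_depth[depth]

similarity = sum(scores_by_depth.values()) / sum(num_marks_by_depth.values())
# scores_by_depth == {1: 95.0, 2: 9.5}; similarity == 104.5 / 120 ≈ 0.87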

machine/punctuation_analysis/quote_convention.py

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention":
         )
         return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark)

+    def __hash__(self) -> int:
+        return hash((self.opening_quotation_mark, self.closing_quotation_mark))
+

 class QuoteConvention:
     def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]):
@@ -57,6 +60,9 @@ def __eq__(self, value):
             return False
         return True

+    def __hash__(self) -> int:
+        return hash(tuple(self.level_conventions))
+
     @property
     def name(self) -> str:
         return self._name
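
The new __hash__ methods exist so that quote conventions can serve as dictionary keys, which the convention_scores dict in the new QuoteConventionAnalysis class relies on. A minimal illustration with a made-up convention:

from machine.punctuation_analysis import QuoteConvention, SingleLevelQuoteConvention

# Hypothetical convention, shown only to demonstrate that conventions are now hashable
convention = QuoteConvention(
    "example_convention",
    [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")],
)
convention_scores = {convention: 0.9}  # valid dict key thanks to __hash__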

machine/punctuation_analysis/quote_convention_analysis.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+from collections import defaultdict
+from typing import Dict, List, Optional
+
+from .quotation_mark_tabulator import QuotationMarkTabulator
+from .quote_convention import QuoteConvention
+
+
+class QuoteConventionAnalysis:
+
+    def __init__(
+        self,
+        convention_scores: dict[QuoteConvention, float],
+        tabulated_quotation_marks: QuotationMarkTabulator,
+        analysis_weight: float = 1.0,  # weight is used for combining scores for multiple books
+    ):
+        self._convention_scores = convention_scores
+        if len(convention_scores) > 0:
+            (self._best_quote_convention, self._best_quote_convention_score) = max(
+                convention_scores.items(), key=lambda item: item[1]
+            )
+        else:
+            self._best_quote_convention_score = 0
+            self._best_quote_convention = None
+
+        self._tabulated_quotation_marks = tabulated_quotation_marks
+        self._analysis_weight = analysis_weight
+
+    @property
+    def analysis_summary(self) -> str:
+        return self._tabulated_quotation_marks.get_summary_message()
+
+    @property
+    def best_quote_convention(self) -> Optional[QuoteConvention]:
+        return self._best_quote_convention
+
+    @property
+    def best_quote_convention_score(self) -> float:
+        return self._best_quote_convention_score
+
+    class Builder:
+        def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator):
+            self._convention_scores: dict[QuoteConvention, float] = {}
+            self._tabulated_quotation_marks = tabulated_quotation_marks
+
+        def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None:
+            self._convention_scores[quote_convention] = score
+
+        def build(self) -> "QuoteConventionAnalysis":
+            return QuoteConventionAnalysis(
+                self._convention_scores,
+                self._tabulated_quotation_marks,
+                self._tabulated_quotation_marks.get_total_quotation_mark_count(),
+            )
+
+    @staticmethod
+    def combine_with_weighted_average(
+        quote_convention_analyses: List["QuoteConventionAnalysis"],
+    ) -> "QuoteConventionAnalysis":
+        total_weight: float = 0
+        convention_votes: Dict[str, float] = defaultdict(float)
+        quote_conventions_by_name: Dict[str, QuoteConvention] = {}
+        total_tabulated_quotation_marks = QuotationMarkTabulator()
+        for quote_convention_analysis in quote_convention_analyses:
+            total_tabulated_quotation_marks.tabulate_from(quote_convention_analysis._tabulated_quotation_marks)
+            total_weight += quote_convention_analysis._analysis_weight
+            for convention, score in quote_convention_analysis._convention_scores.items():
+                if convention.name not in quote_conventions_by_name:
+                    quote_conventions_by_name[convention.name] = convention
+                convention_votes[convention.name] += score * quote_convention_analysis._analysis_weight
+
+        quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(total_tabulated_quotation_marks)
+
+        for convention_name, total_score in convention_votes.items():
+            if total_score > 0:
+                quote_convention_analysis_builder.record_convention_score(
+                    quote_conventions_by_name[convention_name], total_score / total_weight
+                )
+
+        return quote_convention_analysis_builder.build()
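
How the weighted average behaves in practice (book counts and scores below are invented): each per-book analysis carries its total quotation-mark count as analysis_weight (set by the Builder), so books with many observed marks dominate the combined score for each convention.

# Hypothetical per-book results for one convention: (analysis_weight, similarity score)
book_results = [(500, 0.95), (50, 0.60)]  # a long book and a short, noisier one

total_weight = sum(weight for weight, _ in book_results)
combined_score = sum(weight * score for weight, score in book_results) / total_weight
# combined_score == (500 * 0.95 + 50 * 0.60) / 550 ≈ 0.918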

machine/punctuation_analysis/quote_convention_detector.py

Lines changed: 3 additions & 19 deletions
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 from typing import Dict, List, Optional

 from .chapter import Chapter
@@ -8,20 +7,13 @@
 from .quotation_mark_metadata import QuotationMarkMetadata
 from .quotation_mark_string_match import QuotationMarkStringMatch
 from .quotation_mark_tabulator import QuotationMarkTabulator
-from .quote_convention import QuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis
 from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
 from .quote_convention_set import QuoteConventionSet
 from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
 from .usfm_structure_extractor import UsfmStructureExtractor


-@dataclass(frozen=True)
-class QuoteConventionAnalysis:
-    best_quote_convention: QuoteConvention
-    best_quote_convention_score: float
-    analysis_summary: str
-
-
 class QuoteConventionDetector(UsfmStructureExtractor):

     def __init__(self):
@@ -53,15 +45,7 @@ def _count_quotation_marks_in_chapter(

     def detect_quote_convention(
         self, include_chapters: Optional[Dict[int, List[int]]] = None
-    ) -> Optional[QuoteConventionAnalysis]:
+    ) -> QuoteConventionAnalysis:
         self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

-        (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
-            self._quotation_mark_tabulator
-        )
-
-        if score > 0 and best_quote_convention is not None:
-            return QuoteConventionAnalysis(
-                best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message()
-            )
-        return None
+        return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)
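
For callers of detect_quote_convention, the practical change is that the result is never None anymore; the null check moves to best_quote_convention, as the updated manual test later in this diff shows. A before/after sketch (detector stands for any QuoteConventionDetector instance; it is not code from the commit):

# Before this commit: the analysis itself could be None
analysis = detector.detect_quote_convention()
if analysis is not None:
    print(analysis.best_quote_convention.name)

# After this commit: an analysis is always returned; check its best convention instead
analysis = detector.detect_quote_convention()
if analysis.best_quote_convention is not None:
    print(analysis.best_quote_convention.name)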

machine/punctuation_analysis/quote_convention_set.py

Lines changed: 9 additions & 0 deletions
@@ -7,6 +7,7 @@
 from .quotation_mark_direction import QuotationMarkDirection
 from .quotation_mark_tabulator import QuotationMarkTabulator
 from .quote_convention import QuoteConvention
+from .quote_convention_analysis import QuoteConventionAnalysis


 class QuoteConventionSet:
@@ -149,3 +150,11 @@ def find_most_similar_convention(
                 best_quote_convention = quote_convention

         return (best_quote_convention, best_similarity)
+
+    def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis:
+        quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks)
+        for quote_convention in self._conventions:
+            score = tabulated_quotation_marks.calculate_similarity(quote_convention)
+            quote_convention_analysis_builder.record_convention_score(quote_convention, score)
+
+        return quote_convention_analysis_builder.build()

machine/punctuation_analysis/standard_quote_conventions.py

Lines changed: 9 additions & 0 deletions
@@ -187,6 +187,15 @@
             [
                 SingleLevelQuoteConvention("\u00ab", "\u00bb"),
                 SingleLevelQuoteConvention("\u2019", "\u2018"),
+                SingleLevelQuoteConvention("\u201d", "\u201c"),
+            ],
+        ),
+        QuoteConvention(
+            "arabic_inspired_western_european",
+            [
+                SingleLevelQuoteConvention("\u00ab", "\u00bb"),
+                SingleLevelQuoteConvention("\u201d", "\u201c"),
+                SingleLevelQuoteConvention("\u2019", "\u2018"),
             ],
         ),
     ]
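
Rendered, the escapes in the new arabic_inspired_western_european convention are guillemets at the first level and reversed curly quotes at the second and third levels; a quick way to see them:

# Prints the three levels of the new convention: «...», ”...“, ’...‘
for opening, closing in [("\u00ab", "\u00bb"), ("\u201d", "\u201c"), ("\u2019", "\u2018")]:
    print(f"{opening}...{closing}")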

machine/punctuation_analysis/zip_paratext_project_quote_convention_detector.py

Lines changed: 2 additions & 0 deletions
@@ -2,12 +2,14 @@
 from typing import BinaryIO, Optional
 from zipfile import ZipFile

+from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
 from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


 class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
     def __init__(self, archive: ZipFile) -> None:
         self._archive = archive
+        super().__init__(ZipParatextProjectSettingsParser(archive))

     def _exists(self, file_name: str) -> bool:
         return file_name in self._archive.namelist()

tests/corpora/test_usfm_manual.py

Lines changed: 5 additions & 10 deletions
@@ -24,7 +24,7 @@
     ZipParatextProjectSettingsParser,
     ZipParatextProjectTextUpdater,
 )
-from machine.punctuation_analysis import QuoteConventionDetector, ZipParatextProjectQuoteConventionDetector
+from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector


 @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
@@ -135,18 +135,13 @@ def get_usfm(project_path: Path):

 @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
 def test_analyze_corpora_quote_conventions():
-    source_handler = QuoteConventionDetector()
     source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r")
     source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
-    source_quote_convention_detector.get_quote_convention_analysis(source_handler)
+    source_analysis = source_quote_convention_detector.get_quote_convention_analysis()

-    target_handler = QuoteConventionDetector()
     target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r")
     target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
-    target_quote_convention_detector.get_quote_convention_analysis(target_handler)
+    target_analysis = target_quote_convention_detector.get_quote_convention_analysis()

-    source_analysis = source_handler.detect_quote_convention()
-    target_analysis = target_handler.detect_quote_convention()
-
-    assert source_analysis is not None
-    assert target_analysis is not None
+    assert source_analysis.best_quote_convention is not None
+    assert target_analysis.best_quote_convention is not None
