Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion machine/punctuation_analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_detector import QuoteConventionDetector
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .text_segment import TextSegment
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from ..corpora.usfm_parser import parse_usfm
from ..scripture.canon import book_id_to_number, get_scripture_books
from ..utils.typeshed import StrPath
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detector import QuoteConventionDetector


class ParatextProjectQuoteConventionDetector(ABC):
Expand All @@ -17,15 +18,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
self._settings = settings

def get_quote_convention_analysis(
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
handler = QuoteConventionDetector() if handler is None else handler
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> QuoteConventionAnalysis:

book_quote_convention_analyses: List[QuoteConventionAnalysis] = []

for book_id in get_scripture_books():
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
continue
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
continue

handler = QuoteConventionDetector()

with self._open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
try:
Expand All @@ -37,7 +43,11 @@ def get_quote_convention_analysis(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e
return handler.detect_quote_convention(include_chapters)

quote_convention_analysis = handler.detect_quote_convention(include_chapters)
book_quote_convention_analyses.append(quote_convention_analysis)

return QuoteConventionAnalysis.combine_with_weighted_average(book_quote_convention_analyses)

@abstractmethod
def _exists(self, file_name: StrPath) -> bool: ...
Expand Down
51 changes: 39 additions & 12 deletions machine/punctuation_analysis/quotation_mark_tabulator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter, defaultdict
from typing import List
from typing import Dict, List

from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_metadata import QuotationMarkMetadata
Expand All @@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None:
self._quotation_mark_counter.update([quotation_mark])
self._total_count += 1

def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None:
    """Fold the counts held by another ``QuotationMarkCounts`` into this one.

    Every per-mark count in ``quotation_mark_counts`` is added to this
    object's counter, and this object's running total grows by the other
    object's total. ``quotation_mark_counts`` itself is only read.
    """
    incoming_counter = quotation_mark_counts._quotation_mark_counter
    incoming_total = quotation_mark_counts._total_count
    self._quotation_mark_counter.update(incoming_counter)
    self._total_count += incoming_total

def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]:
    """Return ``(mark, count, total)`` for the most frequently counted mark.

    ``mark`` and ``count`` come from the counter's single most common entry;
    ``total`` is this object's running total of all counted marks.

    Raises:
        IndexError: If no mark has been counted yet (the counter is empty).
    """
    most_common_entry = self._quotation_mark_counter.most_common(1)[0]
    return (*most_common_entry, self._total_count)

Expand All @@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None:
for quotation_mark in quotation_marks:
self._count_quotation_mark(quotation_mark)

def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None:
    """Merge the counts gathered by another tabulator into this one.

    For each (depth, direction) key present in ``tabulated_quotation_marks``,
    the matching entry of this tabulator absorbs the other entry's counts via
    ``count_from``. The other tabulator is only read.
    """
    incoming_counts = tabulated_quotation_marks._quotation_counts_by_depth_and_direction
    # NOTE(review): indexing relies on the mapping creating an entry on first
    # access (presumably a defaultdict) -- confirm against the constructor.
    for key, counts in incoming_counts.items():
        self._quotation_counts_by_depth_and_direction[key].count_from(counts)

def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None:
    """Record one quotation mark under its (depth, direction) entry."""
    depth_and_direction = (quotation_mark.depth, quotation_mark.direction)
    counts_for_key = self._quotation_counts_by_depth_and_direction[depth_and_direction]
    counts_for_key.count_quotation_mark(quotation_mark.quotation_mark)
Expand All @@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction(
) -> tuple[str, int, int]:
return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion()

def get_total_quotation_mark_count(self) -> int:
    """Return the number of quotation marks observed across every (depth, direction) entry."""
    return sum(
        counts.get_observed_count() for counts in self._quotation_counts_by_depth_and_direction.values()
    )

def calculate_similarity(self, quote_convention: QuoteConvention) -> float:
weighted_difference = 0
total_weight = 0
for depth, direction in self._quotation_counts_by_depth_and_direction:
num_marks_by_depth: Dict[int, int] = defaultdict(int)
num_matching_marks_by_depth: Dict[int, int] = defaultdict(int)

for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]):
expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)

# Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
weighted_difference += self._quotation_counts_by_depth_and_direction[
num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count()
num_marks_by_depth[depth] += num_matching_marks
num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[
(depth, direction)
].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth)
total_weight += self._quotation_counts_by_depth_and_direction[
(depth, direction)
].get_observed_count() * 2 ** (-depth)
].calculate_num_differences(expected_quotation_mark)

# The scores of greater depths depend on the scores of shallower depths
scores_by_depth: Dict[int, float] = defaultdict(float)
for depth in sorted(num_marks_by_depth.keys()):
previous_depth_score = (
scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1
)
scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]

total_marks = sum(num_marks_by_depth.values())
total_score = sum(scores_by_depth.values())

if total_weight == 0:
if total_marks == 0:
return 0
return 1 - (weighted_difference / total_weight)
return total_score / total_marks

def get_summary_message(self) -> str:
message_lines: List[str] = []
Expand Down
6 changes: 6 additions & 0 deletions machine/punctuation_analysis/quote_convention.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention":
)
return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark)

def __hash__(self) -> int:
    """Hash on the (opening mark, closing mark) pair."""
    mark_pair = (self.opening_quotation_mark, self.closing_quotation_mark)
    return hash(mark_pair)


class QuoteConvention:
def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]):
Expand All @@ -57,6 +60,9 @@ def __eq__(self, value):
return False
return True

def __hash__(self) -> int:
    """Hash on the ordered level conventions; the convention's name is not part of the hash."""
    levels = tuple(self.level_conventions)
    return hash(levels)

@property
def name(self) -> str:
    """The name this convention was given (read-only view of ``_name``)."""
    convention_name = self._name
    return convention_name
Expand Down
79 changes: 79 additions & 0 deletions machine/punctuation_analysis/quote_convention_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from collections import defaultdict
from typing import Dict, List, Optional

from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention


class QuoteConventionAnalysis:
    """Per-convention scores together with the tabulated quotation marks behind them.

    The best-scoring convention is determined once, at construction time. Each
    analysis also carries a weight, which ``combine_with_weighted_average``
    uses when merging several analyses (for example, one per book) into one.
    """

    def __init__(
        self,
        convention_scores: dict[QuoteConvention, float],
        tabulated_quotation_marks: QuotationMarkTabulator,
        analysis_weight: float = 1.0,  # weight is used for combining scores for multiple books
    ):
        """Store the scores and cache the best-scoring convention.

        Args:
            convention_scores: Score for each candidate convention. May be empty,
                in which case there is no best convention and the best score is 0.0.
            tabulated_quotation_marks: Tabulator supplying the summary message for
                this analysis and the counts merged when analyses are combined.
            analysis_weight: Relative weight of this analysis when it is combined
                with others.
        """
        # Copy the mapping: the best convention is cached below, so a caller (or a
        # Builder) that keeps mutating its own dict must not be able to change the
        # stored scores behind the cached result.
        self._convention_scores = dict(convention_scores)

        self._best_quote_convention = None
        self._best_quote_convention_score: float = 0.0
        if self._convention_scores:
            # On ties, max() keeps the first convention in insertion order.
            self._best_quote_convention, self._best_quote_convention_score = max(
                self._convention_scores.items(), key=lambda item: item[1]
            )

        self._tabulated_quotation_marks = tabulated_quotation_marks
        self._analysis_weight = analysis_weight

    @property
    def analysis_summary(self) -> str:
        """Summary message produced by the underlying tabulator."""
        return self._tabulated_quotation_marks.get_summary_message()

    @property
    def best_quote_convention(self) -> Optional[QuoteConvention]:
        """The highest-scoring convention, or ``None`` when no scores were supplied."""
        return self._best_quote_convention

    @property
    def best_quote_convention_score(self) -> float:
        """Score of the best convention (0.0 when no scores were supplied)."""
        return self._best_quote_convention_score

    class Builder:
        """Collects convention scores one at a time and builds an analysis from them."""

        def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator):
            """Start with no scores, bound to the tabulator the scores refer to."""
            self._convention_scores: dict[QuoteConvention, float] = {}
            self._tabulated_quotation_marks = tabulated_quotation_marks

        def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None:
            """Set (or overwrite) the score recorded for ``quote_convention``."""
            self._convention_scores[quote_convention] = score

        def build(self) -> "QuoteConventionAnalysis":
            """Build an analysis weighted by the tabulator's total quotation mark count."""
            return QuoteConventionAnalysis(
                self._convention_scores,
                self._tabulated_quotation_marks,
                self._tabulated_quotation_marks.get_total_quotation_mark_count(),
            )

    @staticmethod
    def combine_with_weighted_average(
        quote_convention_analyses: List["QuoteConventionAnalysis"],
    ) -> "QuoteConventionAnalysis":
        """Merge several analyses into one using a weighted average of their scores.

        Each convention's combined score is the sum of ``score * analysis_weight``
        over the analyses that scored it, divided by the total weight of all
        analyses. Conventions are matched by name. Only conventions whose combined
        score is positive appear in the result. The tabulated marks of every input
        are merged into a fresh tabulator for the returned analysis.

        Args:
            quote_convention_analyses: Analyses to combine. May be empty, which
                yields an analysis with no scores and no best convention.

        Returns:
            The combined analysis, built through ``Builder`` (so its own weight is
            the merged tabulator's total quotation mark count).
        """
        total_weight: float = 0
        convention_votes: Dict[str, float] = defaultdict(float)
        quote_conventions_by_name: Dict[str, QuoteConvention] = {}
        total_tabulated_quotation_marks = QuotationMarkTabulator()

        for analysis in quote_convention_analyses:
            weight = analysis._analysis_weight
            total_tabulated_quotation_marks.tabulate_from(analysis._tabulated_quotation_marks)
            total_weight += weight
            for convention, score in analysis._convention_scores.items():
                # The first convention object seen for a given name represents that name.
                quote_conventions_by_name.setdefault(convention.name, convention)
                convention_votes[convention.name] += score * weight

        quote_convention_analysis_builder = QuoteConventionAnalysis.Builder(total_tabulated_quotation_marks)

        for convention_name, total_score in convention_votes.items():
            # A positive weighted sum implies a positive total weight as long as
            # weights are non-negative, so the division below is safe in that case.
            if total_score > 0:
                quote_convention_analysis_builder.record_convention_score(
                    quote_conventions_by_name[convention_name], total_score / total_weight
                )

        return quote_convention_analysis_builder.build()
22 changes: 3 additions & 19 deletions machine/punctuation_analysis/quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from dataclasses import dataclass
from typing import Dict, List, Optional

from .chapter import Chapter
Expand All @@ -8,20 +7,13 @@
from .quotation_mark_metadata import QuotationMarkMetadata
from .quotation_mark_string_match import QuotationMarkStringMatch
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .usfm_structure_extractor import UsfmStructureExtractor


@dataclass(frozen=True)
class QuoteConventionAnalysis:
best_quote_convention: QuoteConvention
best_quote_convention_score: float
analysis_summary: str


class QuoteConventionDetector(UsfmStructureExtractor):

def __init__(self):
Expand Down Expand Up @@ -53,15 +45,7 @@ def _count_quotation_marks_in_chapter(

def detect_quote_convention(
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
) -> QuoteConventionAnalysis:
self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

(best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
self._quotation_mark_tabulator
)

if score > 0 and best_quote_convention is not None:
return QuoteConventionAnalysis(
best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message()
)
return None
return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/quote_convention_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis


class QuoteConventionSet:
Expand Down Expand Up @@ -149,3 +150,11 @@ def find_most_similar_convention(
best_quote_convention = quote_convention

return (best_quote_convention, best_similarity)

def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis:
    """Score every convention in this set against the tabulated quotation marks.

    Each convention's score is the similarity reported by
    ``tabulated_quotation_marks.calculate_similarity``. All scores are
    recorded, none are filtered out, and the result is built through
    ``QuoteConventionAnalysis.Builder``.
    """
    builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks)
    for candidate in self._conventions:
        similarity = tabulated_quotation_marks.calculate_similarity(candidate)
        builder.record_convention_score(candidate, similarity)
    return builder.build()
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/standard_quote_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,15 @@
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
],
),
QuoteConvention(
"arabic_inspired_western_european",
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
],
),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
from typing import BinaryIO, Optional
from zipfile import ZipFile

from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
def __init__(self, archive: ZipFile) -> None:
    """Keep the archive and initialise the base class with a settings parser built from it."""
    self._archive = archive
    settings_parser = ZipParatextProjectSettingsParser(archive)
    super().__init__(settings_parser)

def _exists(self, file_name: str) -> bool:
    """Return whether ``file_name`` exactly matches a member name listed by the archive."""
    member_names = self._archive.namelist()
    return file_name in member_names
Expand Down
15 changes: 5 additions & 10 deletions tests/corpora/test_usfm_manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
ZipParatextProjectSettingsParser,
ZipParatextProjectTextUpdater,
)
from machine.punctuation_analysis import QuoteConventionDetector, ZipParatextProjectQuoteConventionDetector
from machine.punctuation_analysis import ZipParatextProjectQuoteConventionDetector


@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
Expand Down Expand Up @@ -135,18 +135,13 @@ def get_usfm(project_path: Path):

@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
def test_analyze_corpora_quote_conventions():
source_handler = QuoteConventionDetector()
source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r")
source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
source_quote_convention_detector.get_quote_convention_analysis(source_handler)
source_analysis = source_quote_convention_detector.get_quote_convention_analysis()

target_handler = QuoteConventionDetector()
target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r")
target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
target_quote_convention_detector.get_quote_convention_analysis(target_handler)
target_analysis = target_quote_convention_detector.get_quote_convention_analysis()

source_analysis = source_handler.detect_quote_convention()
target_analysis = target_handler.detect_quote_convention()

assert source_analysis is not None
assert target_analysis is not None
assert source_analysis.best_quote_convention is not None
assert target_analysis.best_quote_convention is not None
Loading
Loading