Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion machine/punctuation_analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_detector import QuoteConventionDetector
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .text_segment import TextSegment
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,51 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import BinaryIO, Dict, List, Optional, Union

from ..corpora.paratext_project_settings import ParatextProjectSettings
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from ..corpora.usfm_parser import parse_usfm
from ..scripture.canon import book_id_to_number, get_scripture_books
from ..utils.typeshed import StrPath
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detector import QuoteConventionDetector


class WeightedAverageQuoteConventionAnalysisBuilder:
    """Accumulates per-book quote convention analyses into one combined analysis.

    Every recorded book contributes each of its convention scores multiplied by
    the book's analysis weight. The combined score of a convention is its
    accumulated weighted score divided by the sum of all recorded weights.
    """

    def __init__(self) -> None:
        # Sum of the weights of every book that was actually recorded.
        self._total_weight: float = 0
        # Convention name -> sum of (score * book weight) over recorded books.
        self._convention_votes: Dict[str, float] = defaultdict(float)
        # Convention name -> first convention object seen under that name.
        self._quote_conventions_by_name: Dict[str, QuoteConvention] = {}
        # Running merge of the tabulated quotation marks of all recorded books.
        self._total_tabulated_quotation_marks = QuotationMarkTabulator()

    def record_book_results(
        self,
        quote_convention_analysis: QuoteConventionAnalysis,
        tabulated_quotation_marks: QuotationMarkTabulator,
    ) -> None:
        """Fold one book's analysis and tabulated marks into the running totals.

        A book whose analysis has no best quote convention, or whose weight is
        zero, is ignored completely: neither its scores nor its tabulated
        marks are merged.
        """
        if quote_convention_analysis.best_quote_convention is None:
            return
        book_weight = quote_convention_analysis.weight
        if book_weight == 0:
            return

        self._total_tabulated_quotation_marks.tabulate_from(tabulated_quotation_marks)
        self._total_weight += book_weight

        for convention, score in quote_convention_analysis.get_all_convention_scores():
            # Keep the first convention object registered for each name.
            self._quote_conventions_by_name.setdefault(convention.name, convention)
            self._convention_votes[convention.name] += score * book_weight

    def to_quote_convention_analysis(self) -> QuoteConventionAnalysis:
        """Build the combined analysis from everything recorded so far.

        Conventions whose accumulated weighted score is not positive are left
        out of the result.
        """
        combined_builder = QuoteConventionAnalysis.Builder(self._total_tabulated_quotation_marks)

        for convention_name, total_score in self._convention_votes.items():
            if not total_score > 0:
                continue
            averaged_score = total_score / self._total_weight
            combined_builder.record_convention_score(
                self._quote_conventions_by_name[convention_name], averaged_score
            )

        return combined_builder.build()


class ParatextProjectQuoteConventionDetector(ABC):
Expand All @@ -17,15 +56,20 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
self._settings = settings

def get_quote_convention_analysis(
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
handler = QuoteConventionDetector() if handler is None else handler
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> QuoteConventionAnalysis:

weighted_average_quote_convention_analysis_builder = WeightedAverageQuoteConventionAnalysisBuilder()

for book_id in get_scripture_books():
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
continue
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
continue

handler = QuoteConventionDetector()

with self._open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
try:
Expand All @@ -37,7 +81,15 @@ def get_quote_convention_analysis(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e
return handler.detect_quote_convention(include_chapters)

quote_convention_analysis, tabulated_quotation_marks = (
handler.detect_quote_convention_and_get_tabulated_quotation_marks(include_chapters)
)
weighted_average_quote_convention_analysis_builder.record_book_results(
quote_convention_analysis, tabulated_quotation_marks
)

return weighted_average_quote_convention_analysis_builder.to_quote_convention_analysis()

# Abstract hook: report whether the project contains a file with the given name.
# get_quote_convention_analysis calls this with each book's file name and skips
# the book when the result is falsy.
@abstractmethod
def _exists(self, file_name: StrPath) -> bool: ...
Expand Down
51 changes: 39 additions & 12 deletions machine/punctuation_analysis/quotation_mark_tabulator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter, defaultdict
from typing import List
from typing import Dict, List

from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_metadata import QuotationMarkMetadata
Expand All @@ -15,6 +15,10 @@ def count_quotation_mark(self, quotation_mark: str) -> None:
self._quotation_mark_counter.update([quotation_mark])
self._total_count += 1

def count_from(self, quotation_mark_counts: "QuotationMarkCounts") -> None:
    """Merge the tallies of another counts object into this one.

    The other object's per-mark counter is added to this object's counter and
    its total count is added to this object's total. The other object is only
    read, never modified.
    """
    incoming_counter = quotation_mark_counts._quotation_mark_counter
    self._quotation_mark_counter.update(incoming_counter)

    incoming_total = quotation_mark_counts._total_count
    self._total_count += incoming_total

def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]:
    """Return ``(most common mark, its count, total count)``.

    Raises:
        IndexError: if the counter holds no marks, because ``most_common(1)``
            then returns an empty list.
    """
    top_mark, top_count = self._quotation_mark_counter.most_common(1)[0]
    return (top_mark, top_count, self._total_count)

Expand All @@ -36,6 +40,13 @@ def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None:
for quotation_mark in quotation_marks:
self._count_quotation_mark(quotation_mark)

def tabulate_from(self, tabulated_quotation_marks: "QuotationMarkTabulator") -> None:
    """Merge every per-(depth, direction) tally of another tabulator into this one.

    For each key present in the other tabulator, the matching counts object of
    this tabulator absorbs the other's counts via ``count_from``.
    """
    incoming_counts = tabulated_quotation_marks._quotation_counts_by_depth_and_direction
    own_counts = self._quotation_counts_by_depth_and_direction
    for key, counts_to_merge in incoming_counts.items():
        own_counts[key].count_from(counts_to_merge)

def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None:
    """Record one quotation mark under its (depth, direction) key."""
    depth_and_direction = (quotation_mark.depth, quotation_mark.direction)
    counts_for_key = self._quotation_counts_by_depth_and_direction[depth_and_direction]
    counts_for_key.count_quotation_mark(quotation_mark.quotation_mark)
Expand All @@ -48,23 +59,39 @@ def _find_most_common_quotation_mark_with_depth_and_direction(
) -> tuple[str, int, int]:
return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion()

def get_total_quotation_mark_count(self) -> int:
    """Return the observed mark count summed over every (depth, direction) key."""
    return sum(
        counts.get_observed_count()
        for counts in self._quotation_counts_by_depth_and_direction.values()
    )

def calculate_similarity(self, quote_convention: QuoteConvention) -> float:
weighted_difference = 0
total_weight = 0
for depth, direction in self._quotation_counts_by_depth_and_direction:
num_marks_by_depth: Dict[int, int] = defaultdict(int)
num_matching_marks_by_depth: Dict[int, int] = defaultdict(int)

for depth, direction in sorted(self._quotation_counts_by_depth_and_direction, key=lambda item: item[0]):
expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction)

# Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
weighted_difference += self._quotation_counts_by_depth_and_direction[
num_matching_marks = self._quotation_counts_by_depth_and_direction[(depth, direction)].get_observed_count()
num_marks_by_depth[depth] += num_matching_marks
num_matching_marks_by_depth[depth] += num_matching_marks - self._quotation_counts_by_depth_and_direction[
(depth, direction)
].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth)
total_weight += self._quotation_counts_by_depth_and_direction[
(depth, direction)
].get_observed_count() * 2 ** (-depth)
].calculate_num_differences(expected_quotation_mark)

# The scores of greater depths depend on the scores of shallower depths
scores_by_depth: Dict[int, float] = defaultdict(float)
for depth in sorted(num_marks_by_depth.keys()):
previous_depth_score = (
scores_by_depth[depth - 1] / num_marks_by_depth[depth - 1] if depth - 1 in scores_by_depth else 1
)
scores_by_depth[depth] = previous_depth_score * num_matching_marks_by_depth[depth]

total_marks = sum(num_marks_by_depth.values())
total_score = sum(scores_by_depth.values())

if total_weight == 0:
if total_marks == 0:
return 0
return 1 - (weighted_difference / total_weight)
return total_score / total_marks

def get_summary_message(self) -> str:
message_lines: List[str] = []
Expand Down
6 changes: 6 additions & 0 deletions machine/punctuation_analysis/quote_convention.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def normalize(self) -> "SingleLevelQuoteConvention":
)
return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark)

def __hash__(self) -> int:
    """Hash on the (opening mark, closing mark) pair."""
    mark_pair = (self.opening_quotation_mark, self.closing_quotation_mark)
    return hash(mark_pair)


class QuoteConvention:
def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]):
Expand All @@ -57,6 +60,9 @@ def __eq__(self, value):
return False
return True

def __hash__(self) -> int:
    """Hash on the ordered sequence of level conventions (the name is not included)."""
    levels = tuple(self.level_conventions)
    return hash(levels)

@property
def name(self) -> str:
    """Return the value stored in ``_name``."""
    return self._name
Expand Down
61 changes: 61 additions & 0 deletions machine/punctuation_analysis/quote_convention_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Optional

from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention


class QuoteConventionAnalysis:
    """Scores of candidate quote conventions plus the tabulated marks behind them.

    The best convention is the highest-scoring one, and is only reported when
    its score is positive.
    """

    def __init__(
        self,
        convention_scores: dict[QuoteConvention, float],
        tabulated_quotation_marks: QuotationMarkTabulator,
        analysis_weight: float = 1.0,  # weight is used for combining scores for multiple books
    ):
        """Store the scores and derive the best convention and its score.

        With no scores at all the best score is 0. When the highest score is
        not positive, the score is kept but the best convention is ``None``.
        """
        self._convention_scores = convention_scores

        if len(convention_scores) > 0:
            top_convention, top_score = max(convention_scores.items(), key=lambda entry: entry[1])
        else:
            top_convention, top_score = None, 0

        self._best_quote_convention_score = top_score
        self._best_quote_convention = top_convention if top_score > 0 else None

        self._tabulated_quotation_marks = tabulated_quotation_marks
        self._analysis_weight = analysis_weight

    def get_all_convention_scores(self) -> list[tuple[QuoteConvention, float]]:
        """Return every ``(convention, score)`` pair as a new list."""
        return list(self._convention_scores.items())

    @property
    def analysis_summary(self) -> str:
        """Summary message produced by the tabulated quotation marks."""
        return self._tabulated_quotation_marks.get_summary_message()

    @property
    def best_quote_convention(self) -> Optional[QuoteConvention]:
        """Highest-scoring convention, or ``None`` when no score is positive."""
        return self._best_quote_convention

    @property
    def best_quote_convention_score(self) -> float:
        """Highest recorded score, or 0 when no scores were given."""
        return self._best_quote_convention_score

    @property
    def weight(self) -> float:
        """Weight given to this analysis when it is combined with others."""
        return self._analysis_weight

    class Builder:
        """Collects convention scores one at a time, then builds the analysis."""

        def __init__(self, tabulated_quotation_marks: QuotationMarkTabulator):
            self._convention_scores: dict[QuoteConvention, float] = {}
            self._tabulated_quotation_marks = tabulated_quotation_marks

        def record_convention_score(self, quote_convention: QuoteConvention, score: float) -> None:
            """Set the score of ``quote_convention``, replacing any earlier value."""
            self._convention_scores[quote_convention] = score

        def build(self) -> "QuoteConventionAnalysis":
            """Create the analysis, weighted by the tabulator's total quotation mark count."""
            total_mark_count = self._tabulated_quotation_marks.get_total_quotation_mark_count()
            return QuoteConventionAnalysis(
                self._convention_scores,
                self._tabulated_quotation_marks,
                total_mark_count,
            )
32 changes: 13 additions & 19 deletions machine/punctuation_analysis/quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

from .chapter import Chapter
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
Expand All @@ -8,20 +7,13 @@
from .quotation_mark_metadata import QuotationMarkMetadata
from .quotation_mark_string_match import QuotationMarkStringMatch
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .usfm_structure_extractor import UsfmStructureExtractor


@dataclass(frozen=True)
class QuoteConventionAnalysis:
best_quote_convention: QuoteConvention
best_quote_convention_score: float
analysis_summary: str


class QuoteConventionDetector(UsfmStructureExtractor):

def __init__(self):
Expand Down Expand Up @@ -53,15 +45,17 @@ def _count_quotation_marks_in_chapter(

def detect_quote_convention(
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
) -> QuoteConventionAnalysis:
self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

(best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
self._quotation_mark_tabulator
)
return STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)

if score > 0 and best_quote_convention is not None:
return QuoteConventionAnalysis(
best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message()
)
return None
def detect_quote_convention_and_get_tabulated_quotation_marks(
    self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Tuple[QuoteConventionAnalysis, QuotationMarkTabulator]:
    """Count the quotation marks of the selected chapters and score every standard convention.

    Returns:
        The analysis produced by ``STANDARD_QUOTE_CONVENTIONS`` together with
        this detector's own tabulator (the same object, not a copy).
    """
    selected_chapters = self.get_chapters(include_chapters)
    self._count_quotation_marks_in_chapters(selected_chapters)

    analysis = STANDARD_QUOTE_CONVENTIONS.score_all_quote_conventions(self._quotation_mark_tabulator)
    return (analysis, self._quotation_mark_tabulator)
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/quote_convention_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_tabulator import QuotationMarkTabulator
from .quote_convention import QuoteConvention
from .quote_convention_analysis import QuoteConventionAnalysis


class QuoteConventionSet:
Expand Down Expand Up @@ -149,3 +150,11 @@ def find_most_similar_convention(
best_quote_convention = quote_convention

return (best_quote_convention, best_similarity)

def score_all_quote_conventions(self, tabulated_quotation_marks: QuotationMarkTabulator) -> QuoteConventionAnalysis:
    """Score every convention in this set against the tabulated quotation marks.

    Each convention's score is the similarity computed by the tabulator. All
    scores are recorded in the returned analysis.
    """
    analysis_builder = QuoteConventionAnalysis.Builder(tabulated_quotation_marks)
    for candidate in self._conventions:
        similarity = tabulated_quotation_marks.calculate_similarity(candidate)
        analysis_builder.record_convention_score(candidate, similarity)
    return analysis_builder.build()
9 changes: 9 additions & 0 deletions machine/punctuation_analysis/standard_quote_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,15 @@
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
],
),
QuoteConvention(
"arabic_inspired_western_european",
[
SingleLevelQuoteConvention("\u00ab", "\u00bb"),
SingleLevelQuoteConvention("\u201d", "\u201c"),
SingleLevelQuoteConvention("\u2019", "\u2018"),
],
),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
from typing import BinaryIO, Optional
from zipfile import ZipFile

from ..corpora.zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector


class ZipParatextProjectQuoteConventionDetector(ParatextProjectQuoteConventionDetector):
def __init__(self, archive: ZipFile) -> None:
    """Keep the archive and initialise the base detector with a zip settings parser."""
    self._archive = archive
    settings_parser = ZipParatextProjectSettingsParser(archive)
    super().__init__(settings_parser)

def _exists(self, file_name: str) -> bool:
    """Return whether the archive's name list contains ``file_name``."""
    member_names = self._archive.namelist()
    return file_name in member_names
Expand Down
Loading
Loading