Skip to content

Commit 7fcb23e

Browse files
committed
Remove custom combining character handling
1 parent b053f5b commit 7fcb23e

File tree

7 files changed

+38
-105
lines changed

7 files changed

+38
-105
lines changed

machine/punctuation_analysis/quotation_mark_finder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ def find_all_potential_quotation_marks_in_text_segment(
4343
quotation_matches.append(
4444
QuotationMarkStringMatch(
4545
text_segment,
46-
text_segment.text.string_index_to_glyph_index(quotation_mark_match.start()),
47-
text_segment.text.string_index_to_glyph_index(quotation_mark_match.end()),
46+
quotation_mark_match.start(),
47+
quotation_mark_match.end(),
4848
)
4949
)
5050
return quotation_matches
machine/punctuation_analysis/text_segment.py

Lines changed: 4 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import unicodedata
21
from typing import Optional, Set
32

43
from ..corpora.usfm_token import UsfmToken
@@ -7,7 +6,7 @@
76

87
class TextSegment:
98
def __init__(self):
10-
self._text: GlyphString = GlyphString("")
9+
self._text = ""
1110
self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
1211
self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
1312
self.previous_segment: Optional[TextSegment] = None
@@ -32,7 +31,7 @@ def __eq__(self, value):
3231
return True
3332

3433
@property
35-
def text(self) -> "GlyphString":
34+
def text(self) -> str:
3635
return self._text
3736

3837
@property
@@ -55,7 +54,7 @@ def is_last_segment_in_verse(self) -> bool:
5554
return self.index_in_verse == self.num_segments_in_verse - 1
5655

5756
def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None:
58-
self._text = GlyphString(self.substring_before(start_index) + replacement + self.substring_after(end_index))
57+
self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index)
5958
if self._usfm_token is not None:
6059
self._usfm_token.text = str(self._text)
6160

@@ -77,70 +76,8 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
7776
return self
7877

7978
def set_text(self, text: str) -> "TextSegment.Builder":
80-
self._text_segment._text = GlyphString(text)
79+
self._text_segment._text = text
8180
return self
8281

8382
def build(self) -> "TextSegment":
8483
return self._text_segment
85-
86-
87-
class GlyphString:
88-
def __init__(self, string: str) -> None:
89-
self._string = string
90-
self._string_index_by_glyph_index = {
91-
glyph_index: string_index
92-
for glyph_index, string_index in enumerate(
93-
[i for i, c in enumerate(string) if unicodedata.category(c) not in ["Mc", "Mn"]]
94-
)
95-
}
96-
97-
def __len__(self) -> int:
98-
return len(self._string_index_by_glyph_index)
99-
100-
def __str__(self):
101-
return self._string
102-
103-
def __eq__(self, other) -> bool:
104-
if not isinstance(other, GlyphString):
105-
return False
106-
return self._string == other._string
107-
108-
def __getitem__(self, key) -> "GlyphString":
109-
if isinstance(key, int):
110-
glyph_start = self._normalize_start_index(key)
111-
glyph_stop = self._normalize_stop_index(glyph_start + 1)
112-
string_start = self._string_index_by_glyph_index.get(glyph_start, len(self))
113-
string_stop = self._string_index_by_glyph_index.get(glyph_stop, None)
114-
return GlyphString(self._string[string_start:string_stop])
115-
elif isinstance(key, slice):
116-
if key.step is not None and key.step != 1:
117-
raise TypeError("Steps are not allowed in _glyphString slices")
118-
glyph_start = self._normalize_start_index(key.start)
119-
glyph_stop = self._normalize_stop_index(key.stop)
120-
string_start = self._string_index_by_glyph_index.get(glyph_start, len(self))
121-
string_stop = self._string_index_by_glyph_index.get(glyph_stop, None)
122-
return GlyphString(self._string[string_start:string_stop])
123-
else:
124-
raise TypeError("Indices must be integers or slices")
125-
126-
def _normalize_start_index(self, index: Optional[int]) -> int:
127-
if index is None:
128-
return 0
129-
if index < 0:
130-
return len(self) + index
131-
return index
132-
133-
def _normalize_stop_index(self, index: Optional[int]) -> int:
134-
if index is None:
135-
return len(self)
136-
if index < 0:
137-
return len(self) + index
138-
return index
139-
140-
def string_index_to_glyph_index(self, string_index: int) -> int:
141-
if string_index == len(self._string):
142-
return len(self)
143-
for g_index, s_index in self._string_index_by_glyph_index.items():
144-
if s_index == string_index:
145-
return g_index
146-
raise ValueError(f"No corresponding glyph index found for string index {string_index}.")

tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -476,13 +476,9 @@ def test_process_scripture_element() -> None:
476476

477477
assert quote_convention_changer._quotation_mark_finder.num_times_called == 1
478478
assert mock_quotation_mark_resolver.num_times_called == 1
479+
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test"
479480
assert (
480-
str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text)
481-
== "this is a ‘test"
482-
)
483-
assert (
484-
str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text)
485-
== "the test ends” here"
481+
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here"
486482
)
487483

488484

@@ -497,7 +493,7 @@ def test_create_text_segments_basic() -> None:
497493
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
498494

499495
assert len(text_segments) == 1
500-
assert str(text_segments[0].text) == "test segment"
496+
assert text_segments[0].text == "test segment"
501497
assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
502498
assert text_segments[0]._markers_in_preceding_context == set()
503499
assert text_segments[0].previous_segment is None
@@ -520,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None:
520516
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
521517

522518
assert len(text_segments) == 1
523-
assert str(text_segments[0].text) == "test segment"
519+
assert text_segments[0].text == "test segment"
524520
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
525521
assert text_segments[0]._markers_in_preceding_context == {
526522
UsfmMarkerType.VERSE,
@@ -550,15 +546,15 @@ def test_create_text_segments_with_multiple_text_tokens() -> None:
550546
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
551547

552548
assert len(text_segments) == 2
553-
assert str(text_segments[0].text) == "test segment1"
549+
assert text_segments[0].text == "test segment1"
554550
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
555551
assert text_segments[0]._markers_in_preceding_context == {
556552
UsfmMarkerType.VERSE,
557553
UsfmMarkerType.PARAGRAPH,
558554
}
559555
assert text_segments[0].previous_segment is None
560556
assert text_segments[0].next_segment == text_segments[1]
561-
assert str(text_segments[1].text) == "test segment2"
557+
assert text_segments[1].text == "test segment2"
562558
assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER
563559
assert text_segments[1]._markers_in_preceding_context == {
564560
UsfmMarkerType.VERSE,
@@ -577,7 +573,7 @@ def test_create_text_segment() -> None:
577573
segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token)
578574

579575
assert segment is not None
580-
assert str(segment.text) == "test segment"
576+
assert segment.text == "test segment"
581577
assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
582578
assert segment._markers_in_preceding_context == set()
583579
assert segment._usfm_token == usfm_token
@@ -647,7 +643,7 @@ def test_update_quotation_marks() -> None:
647643

648644
multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks)
649645

650-
assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”"
646+
assert multi_character_text_segment.text == "this “is ‘a test segment’ ”"
651647

652648
assert multi_character_quotation_marks[0].start_index == 5
653649
assert multi_character_quotation_marks[0].end_index == 6
@@ -707,7 +703,7 @@ def test_update_quotation_marks() -> None:
707703

708704
single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks)
709705

710-
assert str(single_character_text_segment.text) == "this <<is <a test segment> >>"
706+
assert single_character_text_segment.text == "this <<is <a test segment> >>"
711707

712708
assert single_character_quotation_marks[0].start_index == 5
713709
assert single_character_quotation_marks[0].end_index == 7
@@ -768,7 +764,7 @@ def test_start_new_chapter() -> None:
768764
segment = quote_convention_changer._next_scripture_text_segment_builder.build()
769765
assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP
770766
assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER
771-
assert str(segment.text) == ""
767+
assert segment.text == ""
772768
assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context
773769
assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set()
774770

tests/punctuation_analysis/test_quotation_mark_finder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def test_that_all_possible_quotation_marks_are_identified() -> None:
177177

178178
assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment(
179179
TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build()
180-
) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)]
180+
) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)]
181181

182182

183183
def test_that_it_uses_the_quote_convention_set() -> None:

tests/punctuation_analysis/test_quotation_mark_metadata.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None:
1919
end_index=23,
2020
)
2121
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
22-
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said,"
22+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"
2323

2424
quotation_mark_metadata = QuotationMarkMetadata(
2525
quotation_mark='"',
@@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None:
3030
end_index=23,
3131
)
3232
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
33-
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said,"
33+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said,"
3434

3535
quotation_mark_metadata = QuotationMarkMetadata(
3636
quotation_mark='"',
@@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None:
4141
end_index=24,
4242
)
4343
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
44-
assert str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,'
44+
assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,'
4545

4646

4747
def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
@@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
5454
end_index=23,
5555
)
5656
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french"))
57-
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, <<Has God really said,"
57+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, <<Has God really said,"
5858
assert quotation_mark_metadata.start_index == 22
5959
assert quotation_mark_metadata.end_index == 24
6060

@@ -67,7 +67,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
6767
end_index=24,
6868
)
6969
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
70-
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said,"
70+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"
7171
assert quotation_mark_metadata.start_index == 22
7272
assert quotation_mark_metadata.end_index == 23
7373

tests/punctuation_analysis/test_quotation_mark_string_match.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def test_get_previous_character() -> None:
129129
0,
130130
1,
131131
)
132-
assert quotation_mark_string_match.previous_character == "ले"
132+
assert quotation_mark_string_match.previous_character == "\u0947"
133133

134134

135135
def test_get_next_character() -> None:

tests/punctuation_analysis/test_text_segment.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
def test_builder_initialization() -> None:
66
builder = TextSegment.Builder()
77

8-
assert str(builder._text_segment.text) == ""
8+
assert builder._text_segment.text == ""
99
assert builder._text_segment.previous_segment is None
1010
assert builder._text_segment.next_segment is None
1111
assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
@@ -20,7 +20,7 @@ def test_builder_set_text() -> None:
2020
text = "Example text"
2121
builder.set_text(text)
2222

23-
assert str(builder._text_segment.text) == text
23+
assert builder._text_segment.text == text
2424

2525

2626
def test_builder_set_previous_segment() -> None:
@@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None:
6262
assert builder._text_segment._usfm_token is not None
6363
assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT
6464
assert builder._text_segment._usfm_token.text == "USFM token text"
65-
assert str(builder._text_segment.text) == ""
65+
assert builder._text_segment.text == ""
6666
assert builder._text_segment.previous_segment is None
6767
assert builder._text_segment.next_segment is None
6868

@@ -148,10 +148,10 @@ def test_equals() -> None:
148148

149149
def test_get_text() -> None:
150150
text_segment = TextSegment.Builder().set_text("example text").build()
151-
assert str(text_segment.text) == "example text"
151+
assert text_segment.text == "example text"
152152

153153
text_segment = TextSegment.Builder().set_text("new example text").build()
154-
assert str(text_segment.text) == "new example text"
154+
assert text_segment.text == "new example text"
155155

156156

157157
def test_length() -> None:
@@ -163,7 +163,7 @@ def test_length() -> None:
163163

164164
# Combining characters
165165
text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()
166-
assert text_segment.length == 11
166+
assert text_segment.length == 17
167167

168168
# Surrogate pairs
169169
text_segment = TextSegment.Builder().set_text("𝜺𝜺").build()
@@ -251,28 +251,28 @@ def test_is_last_segment_in_verse() -> None:
251251
def test_replace_substring() -> None:
252252
text_segment = TextSegment.Builder().set_text("example text").build()
253253
text_segment.replace_substring(0, 7, "sample")
254-
assert str(text_segment.text) == "sample text"
254+
assert text_segment.text == "sample text"
255255

256256
text_segment.replace_substring(7, 11, "text")
257-
assert str(text_segment.text) == "sample text"
257+
assert text_segment.text == "sample text"
258258

259259
text_segment.replace_substring(0, 7, "")
260-
assert str(text_segment.text) == "text"
260+
assert text_segment.text == "text"
261261

262262
text_segment.replace_substring(0, 4, "new'")
263-
assert str(text_segment.text) == "new'"
263+
assert text_segment.text == "new'"
264264

265265
text_segment.replace_substring(3, 4, "\u2019")
266-
assert str(text_segment.text) == "new\u2019"
266+
assert text_segment.text == "new\u2019"
267267

268268
text_segment.replace_substring(0, 0, "prefix ")
269-
assert str(text_segment.text) == "prefix new\u2019"
269+
assert text_segment.text == "prefix new\u2019"
270270

271271
text_segment.replace_substring(0, 0, "")
272-
assert str(text_segment.text) == "prefix new\u2019"
272+
assert text_segment.text == "prefix new\u2019"
273273

274274
text_segment.replace_substring(11, 11, " suffix")
275-
assert str(text_segment.text) == "prefix new\u2019 suffix"
275+
assert text_segment.text == "prefix new\u2019 suffix"
276276

277277
text_segment.replace_substring(6, 6, "-")
278-
assert str(text_segment.text) == "prefix- new\u2019 suffix"
278+
assert text_segment.text == "prefix- new\u2019 suffix"

0 commit comments

Comments
 (0)