Skip to content

Commit 89aa16c

Browse files
committed
feat: summarize g2p errors in assemble and on the CLI
In assemble, just list the words that could not be g2p'd, so the error message shown to the user in Studio-Web is legible. In the CLI, add a summary list of the words that could not be g2p'd in the error message.
1 parent 34c76da commit 89aa16c

File tree

7 files changed

+45
-21
lines changed

7 files changed

+45
-21
lines changed

readalongs/align_utils.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def parse_and_make_xml(
5858
xml = add_ids(xml)
5959
if save_temps is not None:
6060
save_xml(save_temps + ".ids.readalong", xml)
61-
xml, valid = convert_xml(
61+
xml, valid, non_convertible_words = convert_xml(
6262
xml,
6363
verbose_warnings=verbose_g2p_warnings,
6464
output_orthography=output_orthography,
@@ -67,8 +67,10 @@ def parse_and_make_xml(
6767
save_xml(save_temps + ".g2p.readalong", xml)
6868
if not valid:
6969
raise RuntimeError(
70-
"Some words could not be g2p'd correctly. Aborting. "
71-
"Run with --debug-g2p for more detailed g2p error logs."
70+
"These words could not be g2p'd correctly: '"
71+
+ "', '".join(non_convertible_words)
72+
+ "'. Aborting. "
73+
+ "Run with --debug-g2p for more detailed g2p error logs."
7274
)
7375
return xml
7476

readalongs/cli.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -749,7 +749,9 @@ def g2p(**kwargs):
749749
xml = add_ids(xml)
750750

751751
# Apply the g2p mappings.
752-
xml, valid = convert_xml(xml, verbose_warnings=kwargs["debug_g2p"])
752+
xml, valid, non_convertible_words = convert_xml(
753+
xml, verbose_warnings=kwargs["debug_g2p"]
754+
)
753755

754756
if output_path == "-":
755757
write_xml(sys.stdout.buffer, xml)
@@ -759,7 +761,9 @@ def g2p(**kwargs):
759761

760762
if not valid:
761763
LOGGER.error(
762-
"Some word(s) could not be g2p'd correctly."
764+
"These word(s) could not be g2p'd correctly: '"
765+
+ "', '".join(non_convertible_words)
766+
+ "'."
763767
+ (
764768
" Run again with --debug-g2p to get more detailed error messages."
765769
if not kwargs["debug_g2p"]

readalongs/text/convert_xml.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ def convert_word_with_cascade(
205205
all_g2p_valid = True
206206
if start_time is None:
207207
start_time = perf_counter()
208+
non_convertible_words: Dict[str, bool] = {} # dict used as ordered set
208209
for i, word in enumerate(xml.xpath(".//" + word_unit)):
209210
if time_limit is not None and perf_counter() - start_time > time_limit:
210211
raise TimeLimitException(
@@ -234,6 +235,7 @@ def convert_word_with_cascade(
234235
text_to_g2p, g2p_lang, g2p_fallbacks
235236
)
236237
if not valid:
238+
non_convertible_words[text_to_g2p] = True
237239
all_g2p_valid = False
238240
if effective_g2p_lang:
239241
word.attrib["effective-g2p-lang"] = effective_g2p_lang
@@ -256,7 +258,7 @@ def convert_word_with_cascade(
256258

257259
word.attrib["ARPABET"] = all_arpabet.strip()
258260

259-
return xml, all_g2p_valid
261+
return xml, all_g2p_valid, list(non_convertible_words.keys())
260262

261263

262264
def convert_xml(
@@ -280,21 +282,22 @@ def convert_xml(
280282
of the calling process
281283
282284
Returns:
283-
xml (etree), valid (bool):
285+
xml (etree), valid (bool), non_convertible_words (list[str]):
284286
- xml is a deepcopy of the input xml with the ARPABET attribute added
285287
to each word_unit element;
286288
- valid is a flag indicating whether all words were g2p'd successfully
289+
- non_convertible_words is a list of words that could not be g2p converted
287290
288291
Raises:
289292
TimeLimitException: if the time_limit is specified and exceeded
290293
"""
291294
xml_copy = copy.deepcopy(xml)
292-
xml_copy, valid = convert_words(
295+
xml_copy, valid, non_convertible_words = convert_words(
293296
xml_copy,
294297
word_unit,
295298
output_orthography,
296299
verbose_warnings,
297300
time_limit,
298301
start_time,
299302
)
300-
return xml_copy, valid
303+
return xml_copy, valid, non_convertible_words

readalongs/text/make_package.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def encode_from_path(path: Union[str, os.PathLike]) -> str:
116116
return f"data:{mime_type};base64,{b64}"
117117

118118

119-
def fetch_bundle_file(url, filename, prev_status_code):
119+
def fetch_bundle_file(url: str, filename: str, prev_status_code: Any):
120120
"""Fetch either of the online bundles, or their on-disk fallback if needed."""
121121
import requests # Defer expensive import
122122

readalongs/web_api.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ async def assemble(
259259

260260
# g2p
261261
try:
262-
g2ped, valid = convert_xml(
262+
g2ped, valid, non_convertible_words = convert_xml(
263263
ids_added,
264264
start_time=start_time,
265265
time_limit=ASSEMBLE_TIME_LIMIT_IN_SECONDS,
@@ -268,10 +268,18 @@ async def assemble(
268268
raise HTTPException(status_code=422, detail=str(e)) from e
269269

270270
if not valid:
271+
if non_convertible_words:
272+
logs = (
273+
"These words could not be converted from text to phonemes by g2p: '"
274+
+ "', '".join(non_convertible_words)
275+
+ "'."
276+
)
277+
else:
278+
logs = "Logs: " + captured_logs.getvalue()
271279
raise HTTPException(
272280
status_code=422,
273-
detail="g2p could not be performed, please check your text or your language code. Logs: "
274-
+ captured_logs.getvalue(),
281+
detail="g2p could not be performed, please check your text or your language code. "
282+
+ logs,
275283
)
276284
# create grammar
277285
dict_data, text_input = create_grammar(g2ped)

test/test_g2p_cli.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -449,15 +449,15 @@ def test_convert_xml_subwords(self):
449449
def test_convert_xml_invalid(self):
450450
"""test readalongs.text.convert_xml.convert_xml() with invalid input"""
451451
xml = parse_xml('<s><w ARPABET="V AA L IY D">valid</w></s>')
452-
c_xml, valid = convert_xml(xml)
452+
c_xml, valid, _ = convert_xml(xml)
453453
self.assertEqual(
454454
etree.tounicode(c_xml), '<s><w ARPABET="V AA L IY D">valid</w></s>'
455455
)
456456
self.assertTrue(valid, "convert_xml with valid pre-g2p'd text")
457457

458458
xml = parse_xml('<s><w ARPABET="invalid">invalid</w></s>')
459459
with redirect_stderr(StringIO()):
460-
c_xml, valid = convert_xml(xml)
460+
c_xml, valid, _ = convert_xml(xml)
461461
self.assertEqual(
462462
etree.tounicode(c_xml), '<s><w ARPABET="invalid">invalid</w></s>'
463463
)
@@ -473,13 +473,20 @@ def test_invalid_langs_in_xml(self):
473473
"""
474474
)
475475
with self.assertLogs(LOGGER, level="WARNING") as cm:
476-
c_xml, valid = convert_xml(xml, verbose_warnings=True)
476+
c_xml, valid, _ = convert_xml(xml, verbose_warnings=True)
477477
self.assertFalse(valid)
478478
logger_output = "\n".join(cm.output)
479479
self.assertIn("No lang", logger_output)
480480
self.assertIn("foo", logger_output)
481481
self.assertIn('no path from "crx-syl"', logger_output)
482482

483+
def test_non_convertible_words(self):
484+
xml = parse_xml("<s><w>43:23</w><w>65:67</w><w>43:23</w></s>")
485+
with self.assertLogs(LOGGER, level="WARNING"):
486+
g2p_xml, valid, non_convertible_words = convert_xml(xml)
487+
self.assertFalse(valid)
488+
self.assertEqual(non_convertible_words, ["43:23", "65:67"])
489+
483490

484491
if __name__ == "__main__":
485492
main()

test/test_web_api.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def test_create_grammar(self):
105105
with redirect_stderr(StringIO()):
106106
tokenized = tokenize_xml(parsed)
107107
ids_added = add_ids(tokenized)
108-
g2ped, valid = convert_xml(ids_added)
108+
g2ped, valid, _ = convert_xml(ids_added)
109109

110110
word_dict, text = create_grammar(g2ped)
111111
self.assertTrue(valid)
@@ -159,11 +159,11 @@ def test_convert_time_limit(self):
159159
ids_added, time_limit=1.001, start_time=perf_counter() - 1.0
160160
)
161161
# Lots of time, should not raise
162-
_, valid = convert_xml(
162+
_, valid, _ = convert_xml(
163163
ids_added, time_limit=100, start_time=perf_counter() - 1.0
164164
)
165165
self.assertTrue(valid)
166-
_, valid = convert_xml(ids_added, time_limit=100)
166+
_, valid, _ = convert_xml(ids_added, time_limit=100)
167167
self.assertTrue(valid)
168168

169169
def test_bad_g2p(self):
@@ -189,7 +189,7 @@ def test_g2p_faiture(self):
189189
response = self.API_CLIENT.post("/api/v1/assemble", json=request)
190190
self.assertEqual(response.status_code, 422)
191191
content = response.json()
192-
self.assertIn("No valid g2p conversion", content["detail"])
192+
self.assertIn("These words could not", content["detail"])
193193

194194
def test_no_words(self):
195195
# Test the assemble endpoint with no actual words in the text
@@ -216,7 +216,7 @@ def test_empty_g2p(self):
216216
response = self.API_CLIENT.post("/api/v1/assemble", json=request)
217217
self.assertEqual(response.status_code, 422)
218218
content_log = response.json()["detail"]
219-
for message_part in ["The output of the g2p process", "24", "23", "is empty"]:
219+
for message_part in ["These words could not", "24", "23"]:
220220
self.assertIn(message_part, content_log)
221221

222222
def test_langs(self):

0 commit comments

Comments
 (0)