-
Notifications
You must be signed in to change notification settings - Fork 0
/
ords_deepl_3check.py
110 lines (98 loc) · 3.72 KB
/
ords_deepl_3check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
"""
Series of scripts for translating ORDS `problem` text.
https://github.com/DeepLcom/deepl-python
Step 1: ords_deepl_1setup.py
Table created, MySQL database required.
Step 2: ords_deepl_2fetch.py
Compiles workload, translates, DeepL API key required.
Step 3: ords_deepl_3check.py
Inspect data integrity.
Step 4: ords_deepl_4backfill.py
Translate missing values for given languages.
"""
import pandas as pd
from funcs import *
# Give dbfuncs the settings returned by cfg.get_dbvars(). This runs at module
# level (outside the __main__ guard), before any query below is issued.
# NOTE(review): presumably these are the MySQL connection parameters - confirm
# against the funcs package.
dbfuncs.dbvars = cfg.get_dbvars()
if __name__ == "__main__":

    def _fetch_frame(statement):
        """Run *statement* via dbfuncs.mysql_query_fetchall and wrap the result in a DataFrame."""
        return pd.DataFrame(dbfuncs.mysql_query_fetchall(statement))

    logger = cfg.init_logger(__file__)

    # NOTE: the SQL literals below are kept flush-left on purpose so the text
    # sent to the database is exactly the same as before.

    # ---- Check 1: how many records were detected per language ----
    logger.debug("*** DETECTED ***")
    detected_sql = """
SELECT language_detected, COUNT(*) as records
FROM `ords_problem_translations`
GROUP BY language_detected
ORDER BY records DESC
"""
    detected_counts = _fetch_frame(detected_sql)
    logger.debug(detected_counts)

    # ---- Check 2: detected languages outside the expected set ----
    logger.debug("*** UNKNOWN LANGUAGE DETECTED***")
    outlier_sql = """
SELECT language_known, language_detected, COUNT(*) as records
FROM `ords_problem_translations`
WHERE language_detected NOT IN ('??', 'en', 'de', 'nl', 'fr', 'it', 'es', 'da')
GROUP BY language_known, language_detected
ORDER BY records DESC
"""
    outlier_counts = _fetch_frame(outlier_sql)
    logger.debug(outlier_counts)

    # ---- Check 3: detected language disagrees with the "known" language ----
    # The "known" language is not guaranteed to be right either, so these rows
    # are candidates for review rather than confirmed errors.
    # A grouped summary goes to the log; the individual rows go to a csv file.
    logger.debug("*** MISMATCHED LANGUAGE DETECTION ***")
    misdetect_csv = f"{cfg.OUT_DIR}/deepl_misdetect.csv"
    logger.debug(f"See {misdetect_csv}")
    mismatch_summary_sql = """
SELECT language_known, language_detected, COUNT(*) as records
FROM `ords_problem_translations`
WHERE language_detected != language_known
GROUP BY language_known, language_detected
ORDER BY records DESC
"""
    mismatch_summary = _fetch_frame(mismatch_summary_sql)
    logger.debug(mismatch_summary)
    mismatch_rows_sql = """
SELECT id_ords, language_known, language_detected, problem
FROM `ords_problem_translations`
WHERE language_detected != language_known
ORDER BY language_known, language_detected
"""
    mismatch_rows = _fetch_frame(mismatch_rows_sql)
    mismatch_rows.to_csv(misdetect_csv, index=False)

    # ---- Check 4: every translation is identical to the source text ----
    # Points at either a wrong detected language or malformed problem text.
    # Rows are written to a csv file.
    logger.debug("*** IDENTICAL TRANSLATIONS ***")
    mistranslate_csv = f"{cfg.OUT_DIR}/deepl_mistranslate.csv"
    logger.debug(f"See {mistranslate_csv}")
    identical_sql = """
SELECT id_ords, language_known, language_detected,
en, de, nl, fr, it, es, da
FROM `ords_problem_translations`
WHERE language_detected <> '??'
AND (`en` = `problem`
AND `de` = `problem`
AND `nl` = `problem`
AND `fr` = `problem`
AND `it` = `problem`
AND `es` = `problem`
AND `da` = `problem`)
"""
    identical_rows = _fetch_frame(identical_sql)
    identical_rows.to_csv(mistranslate_csv, index=False)

    # ---- Check 5: at least one translation column is NULL or empty ----
    # Can happen when DeepL credits ran out before the language set was done.
    # MySQL's CONCAT() returns NULL as soon as any argument is NULL, so the
    # first condition catches a NULL in any of the seven columns.
    # Rows are written to a csv file.
    logger.debug("*** MISSING TRANSLATIONS ***")
    missing_csv = f"{cfg.OUT_DIR}/deepl_missing.csv"
    logger.debug(f"See {missing_csv}")
    missing_sql = """
SELECT id_ords, language_known, language_detected,
en, de, nl, fr, it, es, da
FROM `ords_problem_translations`
WHERE CONCAT(`en`,`de`,`nl`,`fr`,`it`,`es`,`da`) IS NULL
OR (`en` = '' OR `de` = '' OR `nl` = '' OR `fr` = '' OR `it` = '' OR `es` = '' OR `da` = '');
"""
    missing_rows = _fetch_frame(missing_sql)
    missing_rows.to_csv(missing_csv, index=False)