-
Notifications
You must be signed in to change notification settings - Fork 0
/
ords_deepl_4backfill.py
178 lines (154 loc) · 6.08 KB
/
ords_deepl_4backfill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python3
"""
Series of scripts for translating ORDS `problem` text.
https://github.com/DeepLcom/deepl-python
Step 1: ords_deepl_1setup.py
Table created, MySQL database required.
Step 2: ords_deepl_2fetch.py
Compiles workload, translates, DeepL API key required.
Step 3: ords_deepl_3check.py
Inspect data integrity.
Step 4: ords_deepl_4backfill.py
Translate missing values for given languages.
"""
import deepl
import pandas as pd
from funcs import *
dbfuncs.dbvars = cfg.get_dbvars()
def find_existing_translation_for_col(problem, column):
sql = f"""
SELECT problem, {column}, COUNT(*) as records
FROM ords_problem_translations
WHERE problem = %(problem)s
AND {column} IS NOT NULL
GROUP BY problem, {column}
HAVING records > 1
ORDER BY records DESC
LIMIT 1;
"""
work = pd.DataFrame(dbfuncs.mysql_query_fetchall(sql, {"problem": problem}))
return work
# Fetch problem text that has not yet been translated.
# Ignore the more useless values.
# Arg 'cols' is a list of columns to check.
def get_work_for_null_lang_vals(cols, max=100):
try:
print("*** FETCHING WORK FOR EMPTY LANGUAGE VALUES ***")
sql = """SELECT *
FROM ords_problem_translations
WHERE language_known <> '??'
AND CONCAT({}) IS NULL
LIMIT {}
""".format(
",".join(cols), max
)
logger.debug(sql)
work = pd.DataFrame(dbfuncs.mysql_query_fetchall(sql))
except Exception as error:
print(f"Exception: {error}")
work = pd.DataFrame()
finally:
return work
def translate_empty_only(data, langdict):
try:
# For each record fetch a translation for each target language where empty.
for i, row in data.iterrows():
d_lang = row.language_known
for column in langdict.keys():
# Is there already a translation for this text?
found = find_existing_translation_for_col(row.problem, column)
if found.empty:
# No existing translation so fetch from API.
# The "detected" language is the known language
if row[column] == None:
t_lang = langdict[column]
print(f"{i} : {row.id_ords} : {t_lang}")
# Has a language been detected for this problem?
# Is the target language the same as the detected language?
if d_lang == t_lang:
# Don't use up API credits.
text = row.problem
else:
# No existing translation so fetch from API.
logger.debug(f"{row.id_ords} is new... translating")
try:
result = translator.translate_text(
row.problem,
target_lang=t_lang,
source_lang=row.language_known,
)
text = result.text
except deepl.DeepLException as error:
print(f"Exception: {error}")
return data
data.at[i, column] = text
else:
# Translation exists so copy from existing.
logger.debug(f"{row.id_ords} exists... copying")
data.at[i, column] = found[column].values[0]
except Exception as error:
print(f"Exception: {error}")
finally:
return data
# Default: replace entire row.
# Or single element list e.g.: ['fr'], replace one column only.
def insert_data(data, columns=[]):
tablename = "ords_problem_translations"
if data.empty:
print("No data to write.")
return False
if len(columns) == 1:
column = columns.pop()
cfile = f"{cfg.OUT_DIR}/deepl_backfilled_lang_{column}.csv"
vals = list(zip(data[column], data["id_ords"]))
sql = f"""UPDATE {tablename} SET `{column}`=%s WHERE id_ords=%s"""
result = dbfuncs.mysql_executemany(sql, vals)
else:
cfile = f"{cfg.OUT_DIR}/deepl_backfilled_lang_all.csv"
vals = list(zip(*[data[col] for col in data]))
logger.debug(vals)
sql = """REPLACE INTO `{}` (`{}`) VALUES ({})""".format(
tablename, "`,`".join(data.columns), ",".join(["%s"] * len(data.columns))
)
logger.debug(sql)
result = dbfuncs.mysql_executemany(sql, vals)
logger.debug(f"{result} updated in {tablename}")
pathfuncs.rm_file(cfile)
data.to_csv(cfile, index=False)
print(f"New data written to {cfile}")
return True
if __name__ == "__main__":
logger = cfg.init_logger(__file__)
# Allows for trial and error without using up API credits.
# Should create a test and use mock there, ideally.
mock = True
translator = deeplfuncs.deeplWrapper(mock)
limit_reached = translator.api_limit_reached()
"""
Get the columns to check for NULL values. Examples:
Just one column.
columns = ['da']
Selection of columns.
columns = ['it','es']
All columns.
columns = deeplfuncs.deeplWrapper.get_columns()
All but the last column.
columns = [x for x in deeplfuncs.deeplWrapper.get_columns()[:-1]]
"""
# Backfilling for all
columns = deeplfuncs.deeplWrapper.get_columns()
limit = 10000
work = get_work_for_null_lang_vals(columns, limit)
work.to_csv(f"{cfg.OUT_DIR}/deepl_backfill_work.csv", index=False)
if limit_reached:
exit()
else:
# Backfilling any empty columns: deeplfuncs.deeplWrapper.langdict
# Backfilling one column only, e.g. Danish: {'da':'da'}
data = translate_empty_only(work, deeplfuncs.deeplWrapper.langdict)
data.to_csv(f"{cfg.OUT_DIR}/deepl_backfill_latest.csv", index=False)
if not mock:
insert_data(data, columns)
dbfuncs.dump_table_to_csv("ords_problem_translations", cfg.DATA_DIR)
else:
logger.debug(data)