-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpostprocess-XML.py
143 lines (125 loc) · 5.26 KB
/
postprocess-XML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
# coding: utf-8
from pathlib import Path
import re
import json
# Re-insert metadata from pre_output/tmp file into XML as tags.
def insert_metatags(text, meta_data):
    """Re-insert metadata placeholders in *text* as XML tags.

    The text is processed line by line:

    * ``$N`` placeholders (plus any whitespace that directly follows them)
      become ``<milestone xml:id="..."/>`` using ``meta_data["milestones"]``.
    * ``#N`` placeholders become ``<note xml:id="..."/>`` using
      ``meta_data["notes"]``.
    * Any line containing ``<text`` is replaced wholesale by a ``<text>``
      opening tag carrying the text version and a text id derived from the
      ``$1`` milestone.

    A ``$`` or ``#`` that is not followed by digits is ordinary text and is
    left untouched (previously such a character could raise ``IndexError``).

    Args:
        text: XML content with ``$N`` / ``#N`` placeholders.
        meta_data: Mapping with ``"milestones"``, ``"notes"`` and
            ``"text_version"`` sub-mappings keyed by placeholder.

    Returns:
        The text with the placeholders replaced by XML tags.

    Raises:
        KeyError: If a placeholder (or a required metadata key) is missing
            from *meta_data*.
    """

    # The metadata sub-mappings are looked up lazily inside the callbacks so
    # that text without placeholders never touches meta_data.
    def milestone_tag(match):
        return '<milestone xml:id="' + meta_data["milestones"][match.group(1)] + '"/>'

    def note_tag(match):
        return '<note xml:id="' + meta_data["notes"][match.group(0)] + '"/>'

    lines = text.split('\n')
    for num, s in enumerate(lines):
        if '$' in s:
            # Trailing whitespace after the placeholder is consumed with it.
            lines[num] = re.sub(r'(\$\d+)\s*', milestone_tag, lines[num])
        if '#' in s:
            lines[num] = re.sub(r'#\d+', note_tag, lines[num])
        if '<text' in s:
            # The text id is the first milestone id minus its final
            # "-<number>" component.
            grab_id = meta_data["milestones"]["$1"]
            grab_id = re.sub(r'(UT22084-\d+-\d+)-\d+', r'\1', grab_id)
            lines[num] = (
                '<text text-version="'
                + meta_data["text_version"]["text_version"]
                + '" text-id="'
                + grab_id
                + '">'
            )
    return '\n'.join(lines)
# Convert flags made by TM editors for alternative sources and dubious
# translations into XML tags.
def create_flags_bo(text):
    """Turn a leading ``!`` editor flag into a ``flag="alternateSource"`` attribute.

    On every line, a ``>`` followed by optional whitespace, ``!`` and
    optional whitespace is rewritten so the tag gains the attribute and the
    marker disappears. Lines without the marker pass through unchanged.
    """
    marker = re.compile(r'>\s*!\s*')
    # Work line by line so the whitespace in the pattern never spans a newline.
    return '\n'.join(
        marker.sub(' flag="alternateSource">', row)
        for row in text.split('\n')
    )
def create_flags_en(text):
    """Turn a leading ``%`` editor flag into a ``flag="dubiousTranslation"`` attribute.

    On every line, a ``>`` followed by optional whitespace, ``%`` and
    optional whitespace is rewritten so the tag gains the attribute and the
    marker disappears. Lines without the marker pass through unchanged.
    """
    marker = re.compile(r'>\s*%\s*')
    # Work line by line so the whitespace in the pattern never spans a newline.
    return '\n'.join(
        marker.sub(' flag="dubiousTranslation">', row)
        for row in text.split('\n')
    )
# Normalize Tibetan; remove spaces created by pybo and reformat folio refs into
# XML tags
def normalize_tibetan(text):
    """Clean up Tibetan content found between XML tags.

    The text is split on ``>...<`` spans. Every piece that contains a tsheg
    is treated as Tibetan and gets three passes, in this order:

    1. folio references such as ``[12b]`` or ``[12.b]`` (and one optional
       following whitespace character) become ``<ref folio="F.12.b"/>``;
    2. spaces are removed unless a lowercase ASCII letter follows (which
       keeps the space inside the ``<ref folio=...>`` tag just inserted);
    3. underscores are turned into spaces.

    Pieces without a tsheg are returned untouched.
    """
    rebuilt = []
    for piece in re.split(r'(>.*?<)', text):
        if '་' in piece:  # a tsheg marks the piece as Tibetan
            with_refs = re.sub(
                r'\[(\d+)\.?([ab])]\s?', r'<ref folio="F.\1.\2"/>', piece
            )
            without_spaces = re.sub(r' (?![a-z])', '', with_refs)
            piece = re.sub('_', ' ', without_spaces)
        rebuilt.append(piece)
    return ''.join(rebuilt)
# Remove Folio Refs in English since they are already marked in the Tibetan
# and they might not line up exactly to the points where the translators have
# placed them.
def remove_folio_refs_en(text):
    """Strip folio references such as ``[12b]`` or ``[12.b]`` from each line.

    One optional whitespace character directly after the closing bracket is
    removed together with the reference. Incomplete references (no closing
    bracket) are left in place.
    """
    folio_ref = re.compile(r'\[\d+\.?[ab]]\s?')
    # Substituting on a line without a complete reference is a no-op, so no
    # separate pre-check is needed. Lines are handled one at a time so the
    # optional trailing whitespace can never swallow a newline.
    return '\n'.join(folio_ref.sub('', row) for row in text.split('\n'))
# This is the primary function in the script to process all XML files exported
# from InterText in directory "post_input" and write to "post_output".
def postprocess(in_dir, out_dir):
    """Process every ``*.xml`` file in *in_dir* and write the result to *out_dir*.

    Files are classified by name:

    * name contains ``bo.en``: alignment file, copied over unchanged;
    * name contains ``.bo`` but not ``.en``: source file, run through
      ``create_flags_bo`` and ``normalize_tibetan``;
    * name contains ``.en`` but not ``.bo``: translation file, run through
      ``insert_metatags``, ``create_flags_en`` and ``remove_folio_refs_en``
      using the metadata in
      ``<in_dir parent>/pre_output/tmp/<stem>-bo.json``.

    XML files matching none of these patterns are ignored.

    Args:
        in_dir: ``pathlib.Path`` of the directory holding the exported XML.
        out_dir: ``pathlib.Path`` of the existing directory to write into.

    Raises:
        SystemExit: If the metadata JSON for a translation file is missing.
    """
    for file in in_dir.glob('*.xml'):
        if not file.is_file():
            continue
        name = file.name

        if 'bo.en' in name:
            # copy over alignment XML file
            copy = file.read_text(encoding='utf-8-sig')
            (out_dir / name).write_text(copy, encoding='utf-8-sig')

        if '.bo' in name and '.en' not in name:
            # process source XML file
            text = file.read_text(encoding='utf-8-sig')
            text = create_flags_bo(text)
            text = normalize_tibetan(text)
            (out_dir / name).write_text(text, encoding='utf-8-sig')

        # Match on '.en' (not the bare substring 'en') so that unrelated
        # files whose name merely contains "en" are not mistaken for
        # translations and do not abort the run on a missing JSON file.
        if '.en' in name and '.bo' not in name:
            # process translation XML file
            # Drop the '.en.xml' suffix to get the stem shared with the
            # metadata file in 'pre_output/tmp'.
            file_stem = name.replace('.en.xml', '')
            json_file = in_dir.parent / 'pre_output' / 'tmp' / (file_stem + '-bo.json')
            if not json_file.is_file():
                # raise SystemExit directly: the exit() helper is only
                # installed by the site module and may be unavailable.
                raise SystemExit(f'{json_file} is missing.\nExiting')
            meta_data = json.loads(json_file.read_text(encoding='utf-8-sig'))
            text = file.read_text(encoding='utf-8-sig')
            text = insert_metatags(text, meta_data)
            text = create_flags_en(text)
            text = remove_folio_refs_en(text)
            (out_dir / name).write_text(text, encoding='utf-8-sig')
if __name__ == '__main__':
    # Input and output folders are relative to the current working directory.
    in_dir = Path('post_input')
    in_dir.mkdir(exist_ok=True)
    out_dir = Path('post_output')
    out_dir.mkdir(exist_ok=True, parents=True)
    # Empty the post_output folder at each run. Only post_output itself is
    # cleared: globbing out_dir.parent would delete every XML file in the
    # current working directory.
    for stale in out_dir.glob('*.xml'):
        stale.unlink()
    postprocess(in_dir, out_dir)