#!/usr/bin/env python
# coding: utf-8
from pathlib import Path
import re
import json


# Remove the unwanted tags that were generated by InterText's TMX export
def remove_tags(tm):
    tm2 = re.sub(r'<prop type="x-sentbreak">\|#\|</prop>', '', tm)
    tm3 = re.sub(r'\|#\| ', '', tm2)
    return tm3
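
# Illustrative example of what remove_tags() strips (hypothetical InterText output):
#   '<seg>|#| བོད་ཡིག...</seg><prop type="x-sentbreak">|#|</prop>'
# becomes
#   '<seg>བོད་ཡིག...</seg>'
# i.e. the "x-sentbreak" <prop> elements and the inline "|#| " markers are removed.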


# Re-insert metadata from the pre_output/tmp file into the TMX as XML tags.
# Also adds proper attributes to the root and header elements.
def insert_metatags(tm, meta_data):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if '<tmx' in s:
            lines[num] = re.sub('<tmx', '<tmx xmlns="http://www.lisa.org/tmx14" xmlns:eft="http://read.84000.co/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"', s)
        if '<header' in s:
            grab_id = meta_data["milestones"]["$1"]
            grab_id = re.sub(r'(UT22084-\d+-\d+)-\d+', r'\1', grab_id)
            new = '<header creationtool="InterText" creationtoolversion="1.0" datatype="PlainText" segtype="block" adminlang="en-us" srclang="bo" o-tmf="XML aligned text" eft:text-id="'
            new += grab_id
            new += '" eft:text-version="'
            new += meta_data["text_version"]["text_version"]
            new += '"/>'
            lines[num] = new
        if '$' in s:
            new = ''
            for chunk in re.split(r'(\$\d+\s*)', lines[num]):
                if '$' in chunk:
                    g_id = re.findall(r'\$\d+', chunk)
                    new += '<tei:milestone xml:id="'
                    new += meta_data["milestones"][g_id[0]]
                    new += '"/>'
                else:
                    new += chunk
            lines[num] = new
        if '#' in s:
            new = ''
            for chunk in re.split(r'(#\d+)', lines[num]):
                if chunk.startswith('#'):
                    g_id = re.findall(r'#\d+', chunk)
                    new += '<tei:note xml:id="'
                    new += meta_data["notes"][g_id[0]]
                    new += '"/>'
                else:
                    new += chunk
            lines[num] = new
    lines = '\n'.join(lines)
    return lines
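
# A minimal sketch of the meta_data dictionary this script assumes, inferred from
# the lookups above and in create_folio_props(); the values are illustrative only
# (the real file is produced by the pre-processing step):
#   {
#       "milestones":   {"$1": "UT22084-061-014-1", "$2": "UT22084-061-014-2", ...},
#       "notes":        {"#1": "UT22084-061-014-note-1", ...},
#       "text_version": {"text_version": "1.0"},
#       "first_page":   {"first_page": "F.1.b"}
#   }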


# Convert flags made by TM editors for alternative sources and dubious
# translations into XML tags.
def create_flags(tm):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if re.search(r'<tuv xml:lang="bo">\s*\t*<seg>\s*!', s):
            lines[num - 1] += '<eft:flag type="alternateSource"/>'
        if re.search(r'<tuv xml:lang="en">\s*\t*<seg>\s*%', s):
            lines[num - 2] += '<eft:flag type="dubiousTranslation"/>'
    lines = '\n'.join(lines)
    return lines
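
# Illustrative example for create_flags() (hypothetical markup): in a unit such as
#   <tu>
#       <tuv xml:lang="bo"><seg>! ...</seg></tuv>
#       <tuv xml:lang="en"><seg>% ...</seg></tuv>
# a leading "!" on the Tibetan segment appends <eft:flag type="alternateSource"/>
# to the preceding line (normally the <tu>), and a leading "%" on the English
# segment appends <eft:flag type="dubiousTranslation"/> two lines above it.
# The "!" and "%" markers themselves are not removed by this function.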


# Normalize Tibetan; remove spaces created by pybo and reformat folio refs into
# XML tags
def normalize_tibetan(tm):
    segments = re.split(r'([</?seg>])', tm)
    for num, s in enumerate(segments):
        if re.search('་', s):  # find Tibetan segments by the presence of tsegs
            s2 = re.sub(r'\[(\d+)\.?([ab])]\s?', r'<tei:ref folio="F.\1.\2"/>', s)
            s3 = re.sub(r'\s(?![a-z])', '', s2)
            segments[num] = re.sub('_', ' ', s3)
    segments = ''.join(segments)
    return segments
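
# Illustrative example for normalize_tibetan() (hypothetical segment content):
#   '[12a] བཀྲ་ ཤིས་ _བདེ་ ལེགས །'
# becomes
#   '<tei:ref folio="F.12.a"/>བཀྲ་ཤིས་ བདེ་ལེགས།'
# i.e. the folio reference is turned into a <tei:ref/> tag, the extra spaces
# introduced by the tokenizer are removed, and "_" (assumed here to stand for
# an original space) is turned back into a space.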


# Remove folio refs from the English since they are already marked in the Tibetan
# and might not line up exactly with the points where the translators have
# placed them.
def remove_folio_refs_en(tm):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if re.search(r'\[\d+\.?[ab]]\s?', s):
            lines[num] = re.sub(r'\[(\d+)\.?([ab])]\s?', '', s)
    lines = '\n'.join(lines)
    return lines


# Generate <prop> folio references for each TM unit
def create_folio_props(tm, meta_data):
    tm_units = re.split('(<tu>)', tm)
    folio_count = meta_data["first_page"]["first_page"]
    for num, s in enumerate(tm_units):
        if re.search(r'folio=', s):
            folio_count = re.findall(r'folio="(F\.\d+\.[ab])', s)[0]
            tm_units[num - 1] = '<tu>'
            tm_units[num - 1] += '<prop name="folio">'
            tm_units[num - 1] += folio_count
            tm_units[num - 1] += '</prop>'
        if re.search(r'<tu>', s):
            tm_units[num] += '<prop name="folio">'
            tm_units[num] += folio_count
            tm_units[num] += '</prop>'
    tm_units = ''.join(tm_units)
    return tm_units
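
# Illustrative result of create_folio_props() (hypothetical unit): each <tu> gets a
# <prop> carrying the folio that is current at that point, e.g.
#   <tu><prop name="folio">F.12.a</prop>
# A unit that contains a <tei:ref folio="F.12.b"/> advances the running folio and
# is labelled with the new value; units without a reference inherit the most
# recent one, starting from meta_data["first_page"].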


# The primary function of the script: process all TMX files exported from
# InterText in directory "post_input" and write the results to "post_output".
def postprocess(in_dir, out_dir):
    # get the path of each TMX file in post_input
    for file in in_dir.glob('*.bo.en.tmx'):
        if not file.is_file():
            exit(f'{file} is missing.\nExiting')
        # access the matching .json file from the 'pre_output/tmp' folder;
        # the file stem is the TMX name with the '.bo.en.tmx' suffix removed
        file_stem = str(file.name).replace('.bo.en.tmx', '')
        json_file = in_dir.parent / 'pre_output' / 'tmp' / (file_stem + '-bo.json')
        if not json_file.is_file():
            exit(f'{json_file} is missing.\nExiting')
        read_meta_data = json_file.read_text(encoding='utf-8-sig')
        meta_data = json.loads(read_meta_data)
        # process the TMX
        tm = file.read_text(encoding='utf-8')
        tm = remove_tags(tm)
        tm = insert_metatags(tm, meta_data)
        tm = create_flags(tm)
        tm = normalize_tibetan(tm)
        tm = remove_folio_refs_en(tm)
        tm = create_folio_props(tm, meta_data)
        to_file = out_dir / file.name
        to_file.write_text(tm, encoding='utf-8')
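
# Expected layout, inferred from the paths used above: InterText TMX exports are
# read from ./post_input/ as "<text>.bo.en.tmx", the matching "<text>-bo.json"
# metadata (written by the pre-processing step) is read from ./pre_output/tmp/,
# and the processed TMX files are written to ./post_output/. Run the script from
# the directory that contains these folders, e.g.:
#   python postprocess-TMX.py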


if __name__ == '__main__':
    in_dir = Path('post_input')
    in_dir.mkdir(exist_ok=True)
    out_dir = Path('post_output')
    out_dir.mkdir(exist_ok=True, parents=True)
    # remove leftover .tmx files from the working directory and the output folder
    # at each run
    for o in out_dir.parent.glob('*.tmx'):
        o.unlink()
    for o in out_dir.glob('*.tmx'):
        o.unlink()
    postprocess(in_dir, out_dir)