#!/usr/bin/env python
# coding: utf-8
from pathlib import Path
import re
import json


# Remove the unwanted tags that were generated by InterText's TMX export
def remove_tags(tm):
    tm2 = re.sub(r'<prop type="x-sentbreak">\|#\|</prop>', '', tm)
    tm3 = re.sub(r'\|#\| ', '', tm2)
    return tm3
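
# Illustrative example of what remove_tags() strips (hypothetical InterText output):
#   '<seg>|#| བོད་ཡིག...</seg><prop type="x-sentbreak">|#|</prop>'
# becomes
#   '<seg>བོད་ཡིག...</seg>'
# i.e. the "x-sentbreak" <prop> elements and the inline "|#| " markers are removed.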


# Re-insert metadata from the pre_output/tmp file into the TMX as XML tags.
# Also adds proper attributes to the root and header elements.
def insert_metatags(tm, meta_data):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if '<tmx' in s:
            lines[num] = re.sub('<tmx', '<tmx xmlns="http://www.lisa.org/tmx14" xmlns:eft="http://read.84000.co/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"', s)
        if '<header' in s:
            grab_id = meta_data["milestones"]["$1"]
            grab_id = re.sub(r'(UT22084-\d+-\d+)-\d+', r'\1', grab_id)
            new = '<header creationtool="InterText" creationtoolversion="1.0" datatype="PlainText" segtype="block" adminlang="en-us" srclang="bo" o-tmf="XML aligned text" eft:text-id="'
            new += grab_id
            new += '" eft:text-version="'
            new += meta_data["text_version"]["text_version"]
            new += '"/>'
            lines[num] = new
        if '$' in s:
            new = ''
            for chunk in re.split(r'(\$\d+\s*)', lines[num]):
                if '$' in chunk:
                    g_id = re.findall(r'\$\d+', chunk)
                    new += '<tei:milestone xml:id="'
                    new += meta_data["milestones"][g_id[0]]
                    new += '"/>'
                else:
                    new += chunk
            lines[num] = new
        if '#' in s:
            new = ''
            for chunk in re.split(r'(#\d+)', lines[num]):
                if chunk.startswith('#'):
                    g_id = re.findall(r'#\d+', chunk)
                    new += '<tei:note xml:id="'
                    new += meta_data["notes"][g_id[0]]
                    new += '"/>'
                else:
                    new += chunk
            lines[num] = new
    lines = '\n'.join(lines)
    return lines
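
# A minimal sketch of the meta_data dictionary this script assumes, inferred from
# the lookups above and in create_folio_props(); the values are illustrative only
# (the real file is produced by the pre-processing step):
#   {
#       "milestones":   {"$1": "UT22084-061-014-1", "$2": "UT22084-061-014-2", ...},
#       "notes":        {"#1": "UT22084-061-014-note-1", ...},
#       "text_version": {"text_version": "1.0"},
#       "first_page":   {"first_page": "F.1.b"}
#   }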


# Convert flags made by TM editors for alternative sources and dubious
# translations into XML tags.
def create_flags(tm):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if re.search(r'<tuv xml:lang="bo">\s*\t*<seg>\s*!', s):
            lines[num - 1] += '<eft:flag type="alternateSource"/>'
        if re.search(r'<tuv xml:lang="en">\s*\t*<seg>\s*%', s):
            lines[num - 2] += '<eft:flag type="dubiousTranslation"/>'
    lines = '\n'.join(lines)
    return lines
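
# Illustrative example for create_flags() (hypothetical markup): in a unit such as
#   <tu>
#       <tuv xml:lang="bo"><seg>! ...</seg></tuv>
#       <tuv xml:lang="en"><seg>% ...</seg></tuv>
# a leading "!" on the Tibetan segment appends <eft:flag type="alternateSource"/>
# to the preceding line (normally the <tu>), and a leading "%" on the English
# segment appends <eft:flag type="dubiousTranslation"/> two lines above it.
# The "!" and "%" markers themselves are not removed by this function.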


# Normalize Tibetan; remove spaces created by pybo and reformat folio refs into
# XML tags
def normalize_tibetan(tm):
    segments = re.split(r'([</?seg>])', tm)
    for num, s in enumerate(segments):
        if re.search('་', s):  # find Tibetan segments by the presence of tsegs
            s2 = re.sub(r'\[(\d+)\.?([ab])]\s?', r'<tei:ref folio="F.\1.\2"/>', s)
            s3 = re.sub(r'\s(?![a-z])', '', s2)
            segments[num] = re.sub('_', ' ', s3)
    segments = ''.join(segments)
    return segments
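
# Illustrative example for normalize_tibetan() (hypothetical segment content):
#   '[12a] བཀྲ་ ཤིས་ _བདེ་ ལེགས །'
# becomes
#   '<tei:ref folio="F.12.a"/>བཀྲ་ཤིས་ བདེ་ལེགས།'
# i.e. the folio reference is turned into a <tei:ref/> tag, the extra spaces
# introduced by the tokenizer are removed, and "_" (assumed here to stand for
# an original space) is turned back into a space.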


# Remove folio refs from the English since they are already marked in the Tibetan
# and might not line up exactly with the points where the translators have
# placed them.
def remove_folio_refs_en(tm):
    lines = tm.split('\n')
    for num, s in enumerate(lines):
        if re.search(r'\[\d+\.?[ab]]\s?', s):
            lines[num] = re.sub(r'\[(\d+)\.?([ab])]\s?', '', s)
    lines = '\n'.join(lines)
    return lines


# Generate <prop> folio references for each TM unit
def create_folio_props(tm, meta_data):
    tm_units = re.split('(<tu>)', tm)
    folio_count = meta_data["first_page"]["first_page"]
    for num, s in enumerate(tm_units):
        if re.search(r'folio=', s):
            folio_count = re.findall(r'folio="(F\.\d+\.[ab])', s)[0]
            tm_units[num - 1] = '<tu>'
            tm_units[num - 1] += '<prop name="folio">'
            tm_units[num - 1] += folio_count
            tm_units[num - 1] += '</prop>'
        if re.search(r'<tu>', s):
            tm_units[num] += '<prop name="folio">'
            tm_units[num] += folio_count
            tm_units[num] += '</prop>'
    tm_units = ''.join(tm_units)
    return tm_units
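
# Illustrative result of create_folio_props() (hypothetical unit): each <tu> gets a
# <prop> carrying the folio that is current at that point, e.g.
#   <tu><prop name="folio">F.12.a</prop>
# A unit that contains a <tei:ref folio="F.12.b"/> advances the running folio and
# is labelled with the new value; units without a reference inherit the most
# recent one, starting from meta_data["first_page"].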


# The primary function of the script: process all TMX files exported from
# InterText in directory "post_input" and write the results to "post_output".
def postprocess(in_dir, out_dir):
    # get the path of each TMX file in post_input
    for file in in_dir.glob('*.bo.en.tmx'):
        if not file.is_file():
            exit(f'{file} is missing.\nExiting')
        # access the matching .json file from the 'pre_output/tmp' folder;
        # the file stem is the TMX name with the '.bo.en.tmx' suffix removed
        file_stem = str(file.name).replace('.bo.en.tmx', '')
        json_file = in_dir.parent / 'pre_output' / 'tmp' / (file_stem + '-bo.json')
        if not json_file.is_file():
            exit(f'{json_file} is missing.\nExiting')
        read_meta_data = json_file.read_text(encoding='utf-8-sig')
        meta_data = json.loads(read_meta_data)
        # process the TMX
        tm = file.read_text(encoding='utf-8')
        tm = remove_tags(tm)
        tm = insert_metatags(tm, meta_data)
        tm = create_flags(tm)
        tm = normalize_tibetan(tm)
        tm = remove_folio_refs_en(tm)
        tm = create_folio_props(tm, meta_data)
        to_file = out_dir / file.name
        to_file.write_text(tm, encoding='utf-8')
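
# Expected layout, inferred from the paths used above: InterText TMX exports are
# read from ./post_input/ as "<text>.bo.en.tmx", the matching "<text>-bo.json"
# metadata (written by the pre-processing step) is read from ./pre_output/tmp/,
# and the processed TMX files are written to ./post_output/. Run the script from
# the directory that contains these folders, e.g.:
#   python postprocess-TMX.py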


if __name__ == '__main__':
    in_dir = Path('post_input')
    in_dir.mkdir(exist_ok=True)
    out_dir = Path('post_output')
    out_dir.mkdir(exist_ok=True, parents=True)
    # remove leftover .tmx files from the working directory and the output folder
    # at each run
    for o in out_dir.parent.glob('*.tmx'):
        o.unlink()
    for o in out_dir.glob('*.tmx'):
        o.unlink()
    postprocess(in_dir, out_dir)