bbc · nigelmegitt · Nov 23, 2021 · Nov 26, 2021 · Jan 17, 2023 · Oct 20, 2023
diff --git a/docs/source/ebu_tt_live.scripts.rst b/docs/source/ebu_tt_live.scripts.rst
@@ -58,3 +58,12 @@ scripts Package
     :undoc-members:
     :show-inheritance:
 
+
+:mod:`imsc_hrm_validator` Module
+-------------------------------------
+
+.. autoclass:: ebu_tt_live.scripts.imsc_hrm_validator.imscHrmValidator
+    :members:
+
+.. automodule:: ebu_tt_live.scripts.imsc_hrm_validator
+    :show-inheritance:
diff --git a/docs/source/scripts_and_their_functions.rst b/docs/source/scripts_and_their_functions.rst
@@ -226,10 +226,15 @@ This script loads a file from the file system and attempts to validate it
 as the specified format, either EBU-TT Part 1, EBU-TT Part 3 or EBU-TT-D.
 By default the expected format is EBU-TT-D.
 
+Additionally, EBU-TT-D documents can be validated against the
+`IMSC-HRM <https://www.w3.org/TR/imsc-hrm/>`_ by adding the ``--hrm`` flag.
+
 Example command lines:
 
 ``validator -i path/to/ebu-tt-1-file-to-test.xml -f 1``
 
 ``validator -i path/to/ebu-tt-3-file-to-test.xml -f 3``
 
 ``validator -i path/to/ebu-tt-d-file-to-test.xml -f D``
+
+``validator -i path/to/ebu-tt-d-file-to-test.xml -f D --hrm``
diff --git a/docs/source/validation_framework.rst b/docs/source/validation_framework.rst
@@ -146,3 +146,20 @@ by using the context manager class and instead of the context being passed
 around as a parameter among functions the binding classes call the
 :py:func:`ebu_tt_live.bindings.pyxb_utils.get_xml_parsing_context` function to
 gain access to the parsing context object.
+
+
+Validation outside document objects
+===================================
+
+When constraints beyond the document specification need to be validated,
+validation code can be written outside the document and bindings objects themselves.
+
+IMSC-HRM validation
+-------------------
+
+The :py:class:`ebu_tt_live.scripts.imsc_hrm_validator.imscHrmValidator` class is an example
+of such out-of-document validation. It provides a single
+:py:meth:`ebu_tt_live.scripts.imsc_hrm_validator.imscHrmValidator.validate` method that
+processes the provided validated EBU-TT-D document, according to the
+`IMSC-HRM <https://www.w3.org/TR/imsc-hrm/>`_ algorithm,
+and returns true or false as appropriate.
diff --git a/ebu_tt_live/adapters/document_data.py b/ebu_tt_live/adapters/document_data.py
@@ -73,7 +73,11 @@ class XMLtoEBUTTDAdapter(IDocumentDataAdapter):
     _provides = EBUTTDDocument
 
     def convert_data(self, data, **kwargs):
-        return EBUTTDDocument.create_from_xml(data), kwargs
+        doc = EBUTTDDocument.create_from_xml(data)  # parse the incoming XML into a document object
+        kwargs.update(dict(  # forward the unparsed input alongside the document under the 'raw_xml' key
+            raw_xml=data
+        ))
+        return doc, kwargs  # same (document, kwargs) pair shape as before, with the extra key added
 
 
 class EBUTTDtoXMLAdapter(IDocumentDataAdapter):

diff --git a/ebu_tt_live/adapters/test/test_data/testEbuttd.xml b/ebu_tt_live/adapters/test/test_data/testEbuttd.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tt:tt ttp:timeBase="media" xml:lang="en-GB" xmlns:ebuttm="urn:ebu:tt:metadata" xmlns:ebuttp="urn:ebu:tt:parameters" xmlns:tt="http://www.w3.org/ns/ttml" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" xmlns:tts="http://www.w3.org/ns/ttml#styling" xmlns:xml="http://www.w3.org/XML/1998/namespace">
+    <tt:head>
+        <tt:metadata>
+            <ebuttm:documentMetadata/>
+        </tt:metadata>
+        <tt:styling>
+            <tt:style xml:id="s0"/>
+        </tt:styling>
+        <tt:layout>
+            <tt:region xml:id="r0" tts:origin="10% 10%" tts:extent="80% 80%"></tt:region>
+        </tt:layout>
+    </tt:head>
+    <tt:body>
+        <tt:div>
+            <tt:p xml:id="ID001" begin="01:23:45.670" end="01:23:45.890">It only took me six days.</tt:p>
+        </tt:div>
+    </tt:body>
+</tt:tt>
+
diff --git a/ebu_tt_live/adapters/test/test_document_data_adapters.py b/ebu_tt_live/adapters/test/test_document_data_adapters.py
@@ -136,11 +136,49 @@ def test_sequence_id_mismatch(self):
 
 
 class TestXMLtoEBUTTDAdapter(TestCase):
-    _output_type = documents.EBUTTDDocument
     _adapter_class = document_data.XMLtoEBUTTDAdapter
-    _expected_keys = []
+    _test_xml_file = 'testEbuttd.xml'  # fixture file name inside the test_data directory
+    _test_data_dir_path = os.path.join(os.path.dirname(__file__), 'test_data')  # resolved relative to this test module
+    _test_xml_path = os.path.join(_test_data_dir_path, _test_xml_file)
+    _output_type = documents.EBUTTDDocument  # type that convert_data's first result is asserted to be
+    _expected_keys = [
+        'raw_xml'  # key that XMLtoEBUTTDAdapter.convert_data adds to the returned kwargs
+    ]
+    instance = None  # replaced with a fresh adapter in setUp
+
+    def setUp(self):
+        self.instance = self._adapter_class()  # new adapter for every test
+        self.assertIsInstance(self.instance, IDocumentDataAdapter)
 
-    # TODO: Finish this once we have EBUTT-D parsing
+    def _assert_output_type(self, result):
+        self.assertIsInstance(result, self._output_type)
+
+    def _assert_kwargs_passtrough(self, result_kwargs, expected_keys):
+        self.assertEqual(set(result_kwargs.keys()), set(expected_keys))  # exact key-set match; order is irrelevant
+
+    def _get_xml(self):
+        with open(self._test_xml_path, 'r') as xml_file:
+            xml_data = xml_file.read()
+        return xml_data
+
+    def _get_input(self):
+        return self._get_xml()  # this adapter consumes the XML text directly
+
+    def test_success(self):
+        expected_keys = []
+        expected_keys.extend(self._expected_keys)  # build a fresh list so the class attribute is not mutated
+        result, res_kwargs = self.instance.convert_data(self._get_input())
+        self._assert_output_type(result)
+        self._assert_kwargs_passtrough(res_kwargs, expected_keys)
+
+    def test_kwargs_passthrough(self):
+        in_kwargs = {
+            'foo': 'bar'
+        }
+        expected_keys = ['foo']  # the caller-supplied key must come back out
+        expected_keys.extend(self._expected_keys)  # ...together with any key the adapter adds itself
+        result, res_kwargs = self.instance.convert_data(self._get_input(), **in_kwargs)
+        self._assert_kwargs_passtrough(res_kwargs, expected_keys)
 
 
 class TestEBUTT3toXMLAdapter(TestXMLtoEBUTT3Adapter):
@@ -164,20 +202,18 @@ def test_sequence_id_match(self):
         pass
 
 
-class TestEBUTTDtoXMLAdapter(TestEBUTT3toXMLAdapter):
+class TestEBUTTDtoXMLAdapter(TestXMLtoEBUTTDAdapter):  # reuses the parent's setUp, helpers and test methods
+    _output_type = six.text_type  # the conversion result is asserted to be a text string
     _adapter_class = document_data.EBUTTDtoXMLAdapter
     _expected_keys = []
 
+    def _get_input(self):  # NOTE(review): dead code - the _get_input defined directly below rebinds this name
+        return documents.EBUTTDDocument.create_from_xml(self._get_xml())
+
     def _get_input(self):
         input_doc = documents.EBUTTDDocument(lang='en-GB')
         return input_doc
 
-    def test_sequence_id_mismatch(self):
-        pass
-
-    def test_sequence_id_match(self):
-        pass
-
 
 class TestEBUTT3toEBUTTDAdapter(TestXMLtoEBUTT3Adapter):
     _adapter_class = document_data.EBUTT3toEBUTTDAdapter

diff --git a/ebu_tt_live/bindings/__init__.py b/ebu_tt_live/bindings/__init__.py
@@ -1539,6 +1539,11 @@ def _validateBinding_vx(self):
 
         super(d_tt_type, self)._validateBinding_vx()
 
+    def get_timing_type(self, timedelta_in):  # wrap timedelta_in in the timing type matching this document's timeBase
+        if self.timeBase == 'media':
+            return ebuttdt.FullClockTimingType(timedelta_in)
+        else:  # NOTE(review): any other timeBase is only logged; the method then returns None implicitly
+            log.error('d_tt_type.get_timing_type() where self.timeBase == {}'.format(self.timeBase))
 
 raw.d_tt_type._SetSupersedingClass(d_tt_type)
 
@@ -1942,6 +1947,10 @@ def _semantic_before_traversal(
             parent_binding=None):
         self._semantic_preprocess_timing(
              dataset=dataset, element_content=element_content)
+        self._semantic_collect_applicable_styles(
+            dataset=dataset,
+            style_type=style_type,
+            parent_binding=parent_binding)
 
     def _semantic_after_traversal(
             self,
@@ -2041,6 +2050,9 @@ def _validateBinding_vx(self):
         raw.layout: layout,
         raw.body_type: body_type,
         },
+    'ebuttd': {
+        raw.d_tt_type: d_tt_type,
+        },
     }
 
 

diff --git a/ebu_tt_live/bindings/_ebuttdt.py b/ebu_tt_live/bindings/_ebuttdt.py
@@ -68,8 +68,11 @@ def _ConvertArguments_vx(cls, args, kw):
         context = get_xml_parsing_context()
         if context is not None:
             # This means we are in XML parsing context. There should be a timeBase and a timing_attribute_name in the
-            # context object.
-            time_base = context['timeBase']
+            # context object. But if there's no timeBase, in the context
+            # of EBU-TT-D, we will assume media. Some files in the wild
+            # trigger this behaviour, for reasons not yet identified, i.e.
+            # we somehow get here without having a timeBase context set.
+            time_base = context.get('timeBase', 'media')
             # It is possible for a timing type to exist as the value of an element not an attribute,
             # in which case no timing_attribute_name is in the context; in that case don't attempt
             # to validate the data against a timebase. At the moment this only affects the
@@ -611,6 +614,9 @@ def _do_eq(self, other):
 
     def __eq__(self, other):
         return self._do_eq(other)
+
+    def __hash__(self):  # a class that defines __eq__ without __hash__ is unhashable in Python 3
+        return hash((self.horizontal, self.vertical))  # hash is derived from the horizontal and vertical components
 
 
 ebuttdt_raw.cellFontSizeType._SetSupersedingClass(CellFontSizeType)

diff --git a/ebu_tt_live/bindings/pyxb_utils.py b/ebu_tt_live/bindings/pyxb_utils.py
@@ -23,7 +23,7 @@ def get_xml_parsing_context():
     into account the timeBase attribute on the tt element. In that case when the timeBase element is encountered by the
     parser is is added to the parsing context object to help PyXB make the right type in the timingType union.
 
-    :return: dict that is te parsing context for the currently running parser
+    :return: dict that is the parsing context for the currently running parser
     :return: None if not in parsing mode
     """
     log.debug('Accessing xml_parsing_context: {}'.format(__xml_parsing_context))

diff --git a/ebu_tt_live/documents/ebuttd.py b/ebu_tt_live/documents/ebuttd.py
@@ -18,6 +18,7 @@ class EBUTTDDocument(SubtitleDocument, TimelineUtilMixin):
     _encoding = 'UTF-8'
 
     def __init__(self, lang):
+        self.load_types_for_document()
         self._ebuttd_content = bindings.ttd(
             timeBase='media',
             head=bindings.d_head_type(
@@ -46,13 +47,23 @@ def validate(self):
             document=self
         )
 
+    @classmethod
+    def load_types_for_document(cls):  # delegates to the bindings module, selecting the 'ebuttd' entry
+        bindings.load_types_for_document('ebuttd')
+
     @classmethod
     def create_from_xml(cls, xml):
         # NOTE: This is a workaround to make the bindings accept separate root element identities
         # for the same name. tt comes in but we rename it to ttd to make the xsd validate.
+        cls.load_types_for_document()
         xml_dom = minidom.parseString(xml)
-        if xml_dom.documentElement.tagName == 'tt':
-            xml_dom.documentElement.tagName = 'ttd'
+        if xml_dom.documentElement.namespaceURI == 'http://www.w3.org/ns/ttml':
+            if xml_dom.documentElement.prefix is not None and \
+               xml_dom.documentElement.prefix != '' and \
+               xml_dom.documentElement.tagName == xml_dom.documentElement.prefix + ':tt':
+                xml_dom.documentElement.tagName = xml_dom.documentElement.prefix +  ':ttd'
+            elif xml_dom.documentElement.tagName == 'tt':
+                xml_dom.documentElement.tagName = 'ttd'
         instance = cls.create_from_raw_binding(
             binding=bindings.CreateFromDOM(
                 xml_dom
@@ -62,6 +73,7 @@ def create_from_xml(cls, xml):
 
     @classmethod
     def create_from_raw_binding(cls, binding):
+        cls.load_types_for_document()
         instance = cls.__new__(cls)
         instance._ebuttd_content = binding
         return instance

diff --git a/ebu_tt_live/gen_uax24.py b/ebu_tt_live/gen_uax24.py
@@ -0,0 +1,116 @@
+"""Process the UAX24 scripts at https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt
+to generate a Python equivalent.
+
+For example a command like:
+python ebu_tt_live/gen_uax24.py -scriptFile uax24scripts.txt -outFile ebu_tt_live/uax24.py
+
+will generate a Python file that specifies script lists that can be queried.
+"""
+
+import argparse
+import sys
+from csv import reader
+
+LIST_SUFFIX='_list'  # appended to each script name to form the generated variable name
+TRIPLE_QUOTE='"""'  # docstring delimiter written into the generated module
+SCRIPTS_TO_LIST={  # script name -> code point entries; filled by genLists(), other scripts are ignored
+    'Common': [],
+    'Latin': [],
+    'Greek': [],
+    'Cyrillic': [],
+    'Hebrew': [],
+    'Han': [],
+    'Katakana': [],
+    'Hiragana': [],
+    'Bopomofo': [],
+    'Hangul': [],
+}
+
+# https://stackoverflow.com/questions/14158868/python-skip-comment-lines-marked-with-in-csv-dictreader
+def decomment(csvfile):  # generator yielding the non-empty, comment-free part of each input line
+    for row in csvfile:
+        raw = row.split('#')[0].strip()  # keep only the text before the first '#', then trim whitespace
+        if raw: yield raw  # blank and comment-only lines are skipped
+
+def writeComments(outFile):  # write the generated module's docstring to outFile
+    outFile.write(TRIPLE_QUOTE)  # opening delimiter of the generated docstring
+    outFile.write(
+        'Utility for discovering which UAX24 script a given character code is in,\n'
+        'useful for example in computing the copy or render times in the IMSC-HRM.\n'
+        '\n'
+        'Auto-generated from UAX24 Scripts.txt using gen_uax24.py\n')
+    outFile.write(TRIPLE_QUOTE)  # closing delimiter
+    outFile.write('\n')
+    return
+
+def writeFuncs(outFile):  # write the lr() helper that the generated lists call to expand ranges
+    outFile.write(
+        'def lr(a, b):\n'
+        '    return list(range(a, b + 1))\n'
+        '\n')
+    return
+
+def genLists(csv_reader):  # collect each row's code point entry into SCRIPTS_TO_LIST under its script name
+    for row in csv_reader:
+        scr = row[1].strip().split(' ', maxsplit=1)[0]  # first word of field 2; NOTE(review): a row without ';' raises IndexError
+        if scr in SCRIPTS_TO_LIST:  # scripts that are not pre-registered are ignored
+            SCRIPTS_TO_LIST[scr].append(row[0].strip())  # field 1 is kept as text, e.g. a single code or 'XXXX..YYYY'
+    return
+
+def charOrRange(char_code: str) -> str:  # render one collected entry as Python source for the generated list
+    range_indicator = char_code.find('..')  # '..' separates the first and last code of a range
+    if range_indicator != -1:
+        return '*lr(0x{}, 0x{})'.format(  # star-unpacks the lr() result into the enclosing list literal
+            char_code[0:range_indicator],
+            char_code[range_indicator+2:]  # assume already stripped of trailing spaces
+        )
+    else:
+        return '0x{}'.format(char_code)  # single code point rendered as a hex literal
+
+def writeLists(outFile):  # write one '<Script>_list = [...]' assignment per entry in SCRIPTS_TO_LIST
+    for script, char_codes in SCRIPTS_TO_LIST.items():
+        outFile.write('\n{}{} = [\n'.format(script, LIST_SUFFIX))  # e.g. 'Latin' + '_list'
+        for char_code in char_codes:
+            outFile.write('    {},\n'.format(
+                charOrRange(char_code)
+            ))
+        outFile.write(']\n')
+    return
+
+def generateUax24(args) -> int:  # run the whole generation using args.scriptFile and args.outFile
+    csv_reader = reader(decomment(args.scriptFile), delimiter=';', skipinitialspace=True)  # rows are ';'-separated fields
+    outFile = args.outFile
+    writeComments(outFile)
+    writeFuncs(outFile)
+    genLists(csv_reader)  # must run before writeLists: it populates SCRIPTS_TO_LIST
+    writeLists(outFile)
+
+    return 1  # NOTE(review): always 1; main() returns it but the __main__ guard discards it
+
+def main():  # parse the command line and dispatch to generateUax24
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '-scriptFile',
+        type=argparse.FileType('rt'),  # argparse opens the file for reading as text
+        required=True,
+        help='UAX24 Scripts file',
+        action='store')
+
+    parser.add_argument(
+        '-outFile',
+        type=argparse.FileType('wt'),  # argparse opens the file for writing as text
+        default=sys.stdout,  # used when -outFile is not given on the command line
+        nargs='?',
+        help='Location to write the python file representing the scripts',
+        action='store')
+
+    parser.set_defaults(func=generateUax24)  # handler invoked below with the parsed arguments
+
+    args = parser.parse_args()
+    return args.func(args)  # passes through generateUax24's return value
+
+
+if __name__ == "__main__":
+    # execute only if run as a script
+    main()