Improve parsing of custom attributes

- Add functionality to parse arbitrary custom attributes
- Add the referenced text to attributes that have an offset and length (when the element text is available)
- Keep `reading_order` and `text_style` for backwards compatibility, but use the camelCased tag names (e.g. `readingOrder`, `textStyle`) in `custom_attributes`
knaw-huc · May 23, 2024 · 834196f · 834196f
1 parent bdb19f2
commit 834196f
Show file tree

Hide file tree

Showing 2 changed files with 169 additions and 33 deletions.
diff --git a/pagexml/parser.py b/pagexml/parser.py
@@ -51,12 +51,13 @@ def parse_line_words(textline: dict) -> List[pdm.PageXMLWord]:
             unicode_string = ""
         try:
             conf = None
+            custom = parse_custom_metadata(word_dict, element_text=unicode_string)
             if word_dict["TextEquiv"] is not None:
                 if "@conf" in word_dict["TextEquiv"]:
                     conf = word_dict["TextEquiv"]["@conf"]
             word = pdm.PageXMLWord(text=unicode_string,
                                    doc_id=word_dict['@id'] if '@id' in word_dict else None,
-                                   metadata=parse_custom_metadata(word_dict) if '@custom' in word_dict else None,
+                                   metadata=custom,
                                    coords=parse_coords(word_dict["Coords"]),
                                    conf=conf)
             words.append(word)
@@ -79,13 +80,13 @@ def parse_text_equiv(text_equiv: dict) -> Union[str, None]:
         return None
 
 
-def parse_textline(textline: dict, custom_tags: Iterable = []) -> pdm.PageXMLTextLine:
+def parse_textline(textline: dict, custom_tags: Iterable = None) -> pdm.PageXMLTextLine:
     text = parse_text_equiv(textline['TextEquiv']) if 'TextEquiv' in textline else None
     try:
         return pdm.PageXMLTextLine(
             xheight=int(textline['@xheight']) if '@xheight' in textline else None,
             doc_id=textline['@id'] if '@id' in textline else None,
-            metadata=parse_custom_metadata(textline, custom_tags)
+            metadata=parse_custom_metadata(textline, custom_tags=custom_tags, element_text=text)
             if '@custom' in textline
             else None,
             coords=parse_coords(textline['Coords']),
@@ -111,58 +112,95 @@ def parse_conf(text_element: dict) -> Union[float, None]:
         return None
 
 
-def parse_textline_list(textline_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextLine]:
+def parse_textline_list(textline_list: list, custom_tags: Iterable = None) -> List[pdm.PageXMLTextLine]:
+    """Parse a list of TextLine dictionaries into a list of PageXMLTextLine objects."""
     return [parse_textline(textline, custom_tags) for textline in textline_list]
 
 
 def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict[str, str]:
+    """Parse a custom metadata element from the custom attribute string.
+
+    Deprecated and kept for backwards compatibility. Please use parse_custom_attributes and
+    parse_custom_attribute_parts."""
     match = re.search(r'\b' + custom_field + r' {(.*?)}', custom_string)
     if not match:
         print(custom_string)
         raise ValueError('Invalid structure metadata in custom attribute.')
-    structure_parts = match.group(1).strip().split(';')
-    metadata = {}
-    for part in structure_parts:
-        if part == '':
-            continue
-        field, value = part.split(':')
-        metadata[field] = value
+    metadata = parse_custom_attribute_parts(match.group(1))
     return metadata
 
 
 def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> List[Dict[str, str]]:
+    """Parse a repeated custom metadata element from the custom attribute string.
+
+    Deprecated and kept for backwards compatibility. Please use parse_custom_attributes and
+    parse_custom_attribute_parts."""
     metadata_list = []
 
     matches = re.finditer(r'\b(' + custom_field + r') {(.*?)}', custom_string)
 
     for match in matches:
         tag = match.group(1)
-        metadata = {"type": tag}
-        structure_parts = match.group(2).strip().split(';')
+        metadata = parse_custom_attribute_parts(match.group(2))
+        metadata['type'] = tag
+        metadata_list.append(metadata)
 
-        for part in structure_parts:
-            if part == '':
-                continue
-            field, value = part.split(':')
+    return metadata_list
 
-            field = field.strip()
-            value = value.strip()
 
-            if field in ('offset', 'length'):
-                metadata[field] = int(value)
-            else:
-                metadata[field] = value
+def parse_custom_attributes(custom_string: str, element_text: str = None) -> List[Dict[str, any]]:
+    """Parse the custom attribute string of a PageXML element."""
 
-        metadata_list.append(metadata)
+    matches = re.finditer(r'\b(\w+) {(.*?)}', custom_string)
+    custom_attributes = []
+    for match in matches:
+        attribute = parse_custom_attribute_parts(match.group(2), element_text=element_text)
+        attribute['tag_name'] = match.group(1)
+        custom_attributes.append(attribute)
+    return custom_attributes
 
-    return metadata_list
 
+def parse_custom_attribute_parts(attribute_string: str, element_text: str = None) -> Dict[str, any]:
+    """Parse the string of custom attributes into a dictionary.
 
-def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = []) -> Dict[str, any]:
-    """Parse custom metadata, like readingOrder, structure."""
+    Assumptions:
+
+    1. attributes are always and only separated by semicolons (;)
+    2. attribute key/value pairs are always separated by a colon (:)
+    3. there is no nesting of attributes. The attributes are a flat list
+    4. attribute values contain only alphanumeric characters: no punctuation,
+       quotes, whitespace or other symbols
+    """
+    structure_parts = attribute_string.strip().split(';')
     metadata = {}
+    for part in structure_parts:
+        if part == '':
+            continue
+        field, value = part.split(':')
+
+        field = field.strip()
+        value = value.strip()
+
+        if field in ('offset', 'length'):
+            metadata[field] = int(value)
+        else:
+            metadata[field] = value
+        if 'offset' in metadata and 'length' in metadata:
+            offset = metadata['offset']
+            length = metadata['length']
+            if element_text is not None:
+                metadata['text'] = element_text[offset:offset+length]
+    return metadata
+
+
+def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = None,
+                          element_text: str = None) -> Dict[str, any]:
+    """Parse custom metadata, like readingOrder, structure, textStyle, unclear, abbrev."""
     if '@custom' not in text_element:
-        return metadata
+        return {}
+    metadata = {
+        'custom_attributes': parse_custom_attributes(text_element['@custom'], element_text=element_text)
+    }
     if 'readingOrder {' in text_element['@custom']:
         metadata['reading_order'] = parse_custom_metadata_element(text_element['@custom'], 'readingOrder')
     if 'structure {' in text_element['@custom']:
@@ -174,10 +212,11 @@ def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable =
     if custom_tags:
         regex_tags = r'(?:' + '|'.join(custom_tags) + r')'
         metadata['custom_tags'] = parse_custom_metadata_element_list(text_element['@custom'], regex_tags)
+    print(f"parser.parse_custom_metadata - metadata: {metadata}")
     return metadata
 
 
-def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Union[pdm.PageXMLTextRegion, None]:
+def parse_textregion(text_region_dict: dict, custom_tags: Iterable = None) -> Union[pdm.PageXMLTextRegion, None]:
     text_region = pdm.PageXMLTextRegion(
         doc_id=text_region_dict['@id'] if '@id' in text_region_dict else None,
         orientation=float(text_region_dict['@orientation']) if '@orientation' in text_region_dict else None,
@@ -214,7 +253,7 @@ def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Unio
     return text_region
 
 
-def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextRegion]:
+def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = None) -> List[pdm.PageXMLTextRegion]:
     return [parse_textregion(textregion_dict, custom_tags) for textregion_dict in textregion_dict_list]
 
 
@@ -267,7 +306,7 @@ def parse_page_reading_order(page_json: dict) -> dict:
     return reading_order
 
 
-def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = []) -> pdm.PageXMLScan:
+def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = None) -> pdm.PageXMLScan:
     """Parse a JSON/xmltodict representation of a PageXML file and return a PageXMLScan object."""
     doc_id = pagexml_file
     coords, text_regions = None, None
@@ -311,8 +350,8 @@ def read_pagexml_file(pagexml_file: str, encoding: str = 'utf-8') -> str:
         return fh.read()
 
 
-def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, custom_tags: Iterable = {},
-                       encoding: str = 'utf-8') -> pdm.PageXMLScan:
+def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
+                       custom_tags: Iterable = None, encoding: str = 'utf-8') -> pdm.PageXMLScan:
     """Read PageXML from file (or content of file passed separately if read from elsewhere,
     e.g. tarball) and return a PageXMLScan object.
 

diff --git a/tests/parser_test.py b/tests/parser_test.py
@@ -35,5 +35,102 @@ def test_parsing_from_json_sets_parents(self):
                 self.assertEqual(True, line.parent == tr)
 
 
+class TestCustomParser(unittest.TestCase):
+
+    def setUp(self) -> None:
+        custom = ("readingOrder {index:0;} abbrev {offset:0; length:4;} "
+                  "unclear {offset:0; length:4; continued:true;} "
+                  "unclear {offset:4; length:5; continued:true;} "
+                  "abbrev {offset:9; length:9;} "
+                  "unclear {offset:9; length:9; continued:true;} "
+                  "textStyle {offset:20; length:1;superscript:true;} "
+                  "madeup {offset:20; length:1;imaginary_attribute:true;} "
+                  "unclear {offset:18; length:33; continued:true;}")
+        self.element = {
+            '@custom': custom
+        }
+
+    def test_parse_metadata_element_list_single_type_as_list(self):
+        metadata = parser.parse_custom_metadata_element_list(self.element['@custom'], 'readingOrder')
+        self.assertEqual(1, len(metadata))
+
+    def test_parse_metadata_element_list_multi_type_as_list(self):
+        metadata = parser.parse_custom_metadata_element_list(self.element['@custom'], 'unclear')
+        self.assertEqual(4, len(metadata))
+
+    def test_parse_custom_attribute_part_returns_dict(self):
+        attributes = parser.parse_custom_attribute_parts('offset:9; length:5')
+        expected = {'offset': 9, 'length': 5}
+        self.assertEqual(expected, attributes)
+
+    def test_parse_custom_attribute_part_returns_text_with_offset(self):
+        text = "this is a text"
+        attribute = parser.parse_custom_attribute_parts('offset:9; length:5', element_text=text)
+        attrib_text = text[9:9+5]
+        self.assertEqual(True, 'text' in attribute)
+        self.assertEqual(attrib_text, attribute['text'])
+
+    def test_parse_custom_attributes_returns_list(self):
+        attributes = parser.parse_custom_attributes('unclear {offset:9; length:5}')
+        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
+        self.assertEqual(expected, attributes)
+
+    def test_parse_custom_attributes_handles_semicolon_at_the_end(self):
+        attributes = parser.parse_custom_attributes('unclear {offset:9; length:5;}')
+        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
+        self.assertEqual(expected, attributes)
+
+    def test_parse_custom_attributes_handles_arbitrary_whitespace(self):
+        attributes = parser.parse_custom_attributes('unclear {offset: 9;  length :5 ;}')
+        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
+        self.assertEqual(expected, attributes)
+
+    def test_parse_custom_attributes_returns_list_with_repeated_elements(self):
+        custom_string = 'unclear {offset:9; length:5} unclear {offset:16; length: 2;}'
+        attributes = parser.parse_custom_attributes(custom_string)
+        expected = [
+            {'offset': 9, 'length': 5, 'tag_name': 'unclear'},
+            {'offset': 16, 'length': 2, 'tag_name': 'unclear'}
+        ]
+        self.assertEqual(expected, attributes)
+
+    def test_parse_custom_metadata_extracts_structure(self):
+        custom = parser.parse_custom_metadata({"@custom": "structure {type: resolution}"})
+        print(custom)
+        self.assertEqual(True, 'structure' in custom)
+        expected = {'type': 'resolution'}
+        self.assertEqual(expected, custom['structure'])
+        expected = [{'type': 'resolution', 'tag_name': 'structure'}]
+        self.assertEqual(expected, custom['custom_attributes'])
+
+    def test_parse_custom_metadata_extracts_all_tag_types(self):
+        custom = parser.parse_custom_metadata(self.element, custom_tags=['unclear'])
+        print(custom['custom_attributes'])
+        tag_types = {'readingOrder', 'unclear', 'abbrev', 'textStyle', 'madeup'}
+        self.assertEqual(tag_types, {attr['tag_name'] for attr in custom['custom_attributes']})
+
+    def test_parse_custom_metadata_returns_text_with_offset(self):
+        text = "this is a text"
+        custom = parser.parse_custom_metadata(self.element, element_text=text)
+        attrib_text = text[0:4]
+        unclears = [attr for attr in custom['custom_attributes'] if attr['tag_name'] == 'unclear']
+        self.assertEqual(True, 'text' in unclears[0])
+        self.assertEqual(attrib_text, unclears[0]['text'])
+
+    def test_parse_custom_metadata_extracts_unique_tag_as_dict(self):
+        custom = parser.parse_custom_metadata(self.element)
+        self.assertEqual(True, isinstance(custom['reading_order'], dict))
+
+    def test_parse_custom_metadata_extracts_multiple_tags_of_same_type_as_list(self):
+        custom = parser.parse_custom_metadata(self.element)
+        unclear = [attr for attr in custom['custom_attributes'] if attr['tag_name'] == 'unclear']
+        self.assertEqual(4, len(unclear))
+
+    def test_parse_custom_metadata_extracts_all_tags(self):
+        custom = parser.parse_custom_metadata(self.element)
+        print(custom)
+        self.assertEqual(9, len(custom['custom_attributes']))
+
+
 if __name__ == '__main__':
     unittest.main()