Merge pull request #11 from LvanWissen/master

knaw-huc · Sep 27, 2023 · 91f3170 · 91f3170
2 parents e121c92 + 2a7754a
commit 91f3170
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 16 deletions.
diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py
@@ -316,6 +316,44 @@ def read_line_format_file(line_format_files: Union[str, List[str]],
                     yield {header: row[hi] if len(row) > hi else None for hi, header in enumerate(headers)}
 
 
+def get_custom_tags(doc: pdm.PageXMLDoc) -> List[Dict[str, any]]:
+    """
+    Get all custom tags and their textual values from a PageXMLDoc.
+
+    This function assumes that the PageXML document is generated with
+    input of some `custom_tags` in the parse_pagexml_file function.
+    This helper retrieves those tags from all TextLines and finds the
+    corresponding text from their offset and length. It returns a
+    dictionary with the tag type, the textual value, region and line
+    id, and the offset and length.
+
+    :param doc: A PageXMLDoc
+    :type doc: pdm.PageXMLDoc
+    :return: List of custom tags
+    :rtype: List[Dict[str, any]]
+    """
+    custom_tags = []
+
+    for region in doc.text_regions:
+        for line in region.lines:
+            for tag_el in line.metadata.get("custom_tags", []):
+                tag = tag_el["type"]
+                offset = tag_el["offset"]
+                length = tag_el["length"]
+
+                value = line.text[offset:offset+length]
+
+                custom_tags.append({
+                    "type": tag, 
+                    "value": value, 
+                    "region_id": region.id, 
+                    "line_id": line.id,                     
+                    "offset": offset, 
+                    "length": length,
+                })
+
+    return custom_tags
+
 class LineIterable:
 
     def __init__(self, line_format_files: Union[str, List[str]], headers: List[str] = None):

diff --git a/pagexml/parser.py b/pagexml/parser.py
@@ -3,7 +3,7 @@
 import re
 from xml.parsers import expat
 from datetime import datetime
-from typing import Generator, List, Dict, Union
+from typing import Generator, List, Dict, Union, Iterable
 
 import xmltodict
 from dateutil.parser import parse as date_parse
@@ -78,13 +78,13 @@ def parse_text_equiv(text_equiv: dict) -> Union[str, None]:
         return None
 
 
-def parse_textline(textline: dict) -> pdm.PageXMLTextLine:
+def parse_textline(textline: dict, custom_tags: Iterable = []) -> pdm.PageXMLTextLine:
     text = parse_text_equiv(textline['TextEquiv']) if 'TextEquiv' in textline else None
     try:
         return pdm.PageXMLTextLine(
             xheight=int(textline['@xheight']) if '@xheight' in textline else None,
             doc_id=textline['@id'] if '@id' in textline else None,
-            metadata=parse_custom_metadata(textline)
+            metadata=parse_custom_metadata(textline, custom_tags)
             if '@custom' in textline
             else None,
             coords=parse_coords(textline['Coords']),
@@ -110,8 +110,8 @@ def parse_conf(text_element: dict) -> Union[float, None]:
         return None
 
 
-def parse_textline_list(textline_list: list) -> List[pdm.PageXMLTextLine]:
-    return [parse_textline(textline) for textline in textline_list]
+def parse_textline_list(textline_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextLine]:
+    return [parse_textline(textline, custom_tags) for textline in textline_list]
 
 
 def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict[str, str]:
@@ -129,7 +129,35 @@ def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict
     return metadata
 
 
-def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]:
+def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> List[Dict[str, str]]:
+    metadata_list = []
+
+    matches = re.finditer(r'\b(' + custom_field + r') {(.*?)}', custom_string)
+
+    for match in matches:
+        tag = match.group(1)
+        metadata = {"type": tag}
+        structure_parts = match.group(2).strip().split(';')
+
+        for part in structure_parts:
+            if part == '':
+                continue
+            field, value = part.split(':')
+
+            field = field.strip()
+            value = value.strip()
+
+            if field in ('offset', 'length'):
+                metadata[field] = int(value)
+            else:
+                metadata[field] = value
+
+        metadata_list.append(metadata)
+
+    return metadata_list
+
+
+def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = []) -> Dict[str, any]:
     """Parse custom metadata, like readingOrder, structure."""
     metadata = {}
     if '@custom' not in text_element:
@@ -140,10 +168,15 @@ def parse_custom_metadata(text_element: Dict[str, any]) -> Dict[str, any]:
         metadata['structure'] = parse_custom_metadata_element(text_element['@custom'], 'structure')
         if 'type' in metadata['structure']:
             metadata['type'] = metadata['structure']['type']
+    if 'textStyle {' in text_element['@custom']:
+        metadata['text_style'] = parse_custom_metadata_element_list(text_element['@custom'], 'textStyle')
+    if custom_tags:
+        regex_tags = r'(?:' + '|'.join(custom_tags) + r')'
+        metadata['custom_tags'] = parse_custom_metadata_element_list(text_element['@custom'], regex_tags)
     return metadata
 
 
-def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, None]:
+def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Union[pdm.PageXMLTextRegion, None]:
     text_region = pdm.PageXMLTextRegion(
         doc_id=text_region_dict['@id'] if '@id' in text_region_dict else None,
         orientation=float(text_region_dict['@orientation']) if '@orientation' in text_region_dict else None,
@@ -157,17 +190,17 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non
             text_region.text = parse_text_equiv(text_region_dict[child])
         if child == 'TextLine':
             if isinstance(text_region_dict['TextLine'], list):
-                text_region.lines = parse_textline_list(text_region_dict['TextLine'])
+                text_region.lines = parse_textline_list(text_region_dict['TextLine'], custom_tags)
             else:
-                text_region.lines = [parse_textline(text_region_dict['TextLine'])]
+                text_region.lines = [parse_textline(text_region_dict['TextLine'], custom_tags)]
             text_region.set_as_parent(text_region.lines)
             if not text_region.coords:
                 text_region.coords = parse_derived_coords(text_region.lines)
         if child == 'TextRegion':
             text_region.text_regions = []
             if isinstance(text_region_dict['TextRegion'], list) is False:
                 text_region_dict['TextRegion'] = [text_region_dict['TextRegion']]
-            for tr in parse_textregion_list(text_region_dict['TextRegion']):
+            for tr in parse_textregion_list(text_region_dict['TextRegion'], custom_tags):
                 if tr is not None:
                     text_region.text_regions.append(tr)
             text_region.set_as_parent(text_region.text_regions)
@@ -180,8 +213,8 @@ def parse_textregion(text_region_dict: dict) -> Union[pdm.PageXMLTextRegion, Non
     return text_region
 
 
-def parse_textregion_list(textregion_dict_list: list) -> List[pdm.PageXMLTextRegion]:
-    return [parse_textregion(textregion_dict) for textregion_dict in textregion_dict_list]
+def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = []) -> List[pdm.PageXMLTextRegion]:
+    return [parse_textregion(textregion_dict, custom_tags) for textregion_dict in textregion_dict_list]
 
 
 def parse_page_metadata(metadata_json: dict) -> dict:
@@ -233,7 +266,7 @@ def parse_page_reading_order(page_json: dict) -> dict:
     return reading_order
 
 
-def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan:
+def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = []) -> pdm.PageXMLScan:
     """Parse a JSON/xmltodict representation of a PageXML file and return a PageXMLScan object."""
     doc_id = pagexml_file
     coords, text_regions = None, None
@@ -254,7 +287,7 @@ def parse_pagexml_json(pagexml_file: str, scan_json: dict) -> pdm.PageXMLScan:
         text_regions = []
         if isinstance(scan_json['TextRegion'], list) is False:
             scan_json['TextRegion'] = [scan_json['TextRegion']]
-        for tr in parse_textregion_list(scan_json['TextRegion']):
+        for tr in parse_textregion_list(scan_json['TextRegion'], custom_tags=custom_tags):
             if tr is not None:
                 text_regions.append(tr)
     if 'ReadingOrder' in scan_json and scan_json['ReadingOrder']:
@@ -277,7 +310,7 @@ def read_pagexml_file(pagexml_file: str, encoding: str = 'utf-8') -> str:
         return fh.read()
 
 
-def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
+def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, custom_tags: Iterable = {}, 
                        encoding: str = 'utf-8') -> pdm.PageXMLScan:
     """Read PageXML from file (or content of file passed separately if read from elsewhere,
     e.g. tarball) and return a PageXMLScan object.
@@ -286,6 +319,8 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
     :type pagexml_file: str
     :param pagexml_data: string representation of PageXML document (corresponding to the content of pagexml_file)
     :type pagexml_data: str
+    :param custom_tags: list of custom tags to be parsed in the metadata
+    :type custom_tags: list
     :param encoding: the encoding of the file (default utf-8)
     :type encoding: str
     :return: a pdm.PageXMLScan object
@@ -295,7 +330,7 @@ def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
         pagexml_data = read_pagexml_file(pagexml_file, encoding=encoding)
     scan_json = xmltodict.parse(pagexml_data)
     try:
-        scan_doc = parse_pagexml_json(pagexml_file, scan_json)
+        scan_doc = parse_pagexml_json(pagexml_file, scan_json, custom_tags=custom_tags)
     except BaseException:
         print(f'Error parsing file {pagexml_file}')
         raise