Skip to content

Commit

Permalink
Improve parsing of custom attributes
Browse files Browse the repository at this point in the history
- Add functionality to parse arbitrary custom attributes
- Add referenced text to attributes with an offset and length (and text)
- Keep `reading_order` and `text_style` for backwards compatibility but use camelCased version in `custom_attributes`
  • Loading branch information
marijnkoolen committed May 23, 2024
1 parent bdb19f2 commit 834196f
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 33 deletions.
105 changes: 72 additions & 33 deletions pagexml/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ def parse_line_words(textline: dict) -> List[pdm.PageXMLWord]:
unicode_string = ""
try:
conf = None
custom = parse_custom_metadata(word_dict, element_text=unicode_string)
if word_dict["TextEquiv"] is not None:
if "@conf" in word_dict["TextEquiv"]:
conf = word_dict["TextEquiv"]["@conf"]
word = pdm.PageXMLWord(text=unicode_string,
doc_id=word_dict['@id'] if '@id' in word_dict else None,
metadata=parse_custom_metadata(word_dict) if '@custom' in word_dict else None,
metadata=custom,
coords=parse_coords(word_dict["Coords"]),
conf=conf)
words.append(word)
Expand All @@ -79,13 +80,13 @@ def parse_text_equiv(text_equiv: dict) -> Union[str, None]:
return None


def parse_textline(textline: dict, custom_tags: Iterable = []) -> pdm.PageXMLTextLine:
def parse_textline(textline: dict, custom_tags: Iterable = None) -> pdm.PageXMLTextLine:
text = parse_text_equiv(textline['TextEquiv']) if 'TextEquiv' in textline else None
try:
return pdm.PageXMLTextLine(
xheight=int(textline['@xheight']) if '@xheight' in textline else None,
doc_id=textline['@id'] if '@id' in textline else None,
metadata=parse_custom_metadata(textline, custom_tags)
metadata=parse_custom_metadata(textline, custom_tags=custom_tags, element_text=text)
if '@custom' in textline
else None,
coords=parse_coords(textline['Coords']),
Expand All @@ -111,58 +112,95 @@ def parse_conf(text_element: dict) -> Union[float, None]:
return None


def parse_textline_list(textline_list: list, custom_tags: Iterable = None) -> List[pdm.PageXMLTextLine]:
    """Parse a list of TextLine dictionaries into a list of PageXMLTextLine objects.

    Each dictionary is handed to parse_textline together with custom_tags; the
    result keeps the order of the input list.

    custom_tags defaults to None rather than a mutable list so that no default
    object is shared between calls.
    """
    return [parse_textline(textline, custom_tags) for textline in textline_list]


def parse_custom_metadata_element(custom_string: str, custom_field: str) -> Dict[str, str]:
    """Parse a single custom metadata element from the custom attribute string.

    Deprecated and kept for backwards compatibility. Please use
    parse_custom_attributes and parse_custom_attribute_parts.

    Only the first '<custom_field> {...}' element in custom_string is parsed.
    custom_field is inserted into the regular expression unescaped.

    Returns the dictionary produced by parse_custom_attribute_parts for the
    text between the braces.

    Raises ValueError if custom_string contains no such element.
    """
    match = re.search(r'\b' + custom_field + r' {(.*?)}', custom_string)
    if match is None:
        # Report the offending input in the exception instead of printing it
        # to stdout, so library users see it where the error is handled.
        raise ValueError(
            'Invalid structure metadata in custom attribute.'
            f" No '{custom_field} {{...}}' element found in: {custom_string!r}"
        )
    return parse_custom_attribute_parts(match.group(1))


def parse_custom_metadata_element_list(custom_string: str, custom_field: str) -> List[Dict[str, str]]:
    """Parse a repeated custom metadata element from the custom attribute string.

    Deprecated and kept for backwards compatibility. Please use
    parse_custom_attributes and parse_custom_attribute_parts.

    custom_field is inserted into the regular expression unescaped, so it may
    itself be a pattern (parse_custom_metadata passes a non-capturing
    alternation of tag names). It must not contain capturing groups, because
    the element body is read from group 2.

    Returns one dictionary per matching '<tag> {...}' element, in order of
    appearance, or an empty list when nothing matches. Each dictionary holds
    the parsed attributes plus a 'type' key with the matched tag name.
    """
    metadata_list = []
    for match in re.finditer(r'\b(' + custom_field + r') {(.*?)}', custom_string):
        metadata = parse_custom_attribute_parts(match.group(2))
        # Set after parsing: the tag name replaces any attribute that is
        # itself called 'type' inside the braces.
        metadata['type'] = match.group(1)
        metadata_list.append(metadata)
    return metadata_list

field = field.strip()
value = value.strip()

if field in ('offset', 'length'):
metadata[field] = int(value)
else:
metadata[field] = value
def parse_custom_attributes(custom_string: str, element_text: str = None) -> List[Dict[str, any]]:
    """Parse the custom attribute string of a PageXML element.

    Every '<tag> {...}' element in custom_string (a word, one space, then a
    brace-delimited body) becomes one dictionary: the attributes parsed by
    parse_custom_attribute_parts plus a 'tag_name' key holding the tag.
    Repeated tags yield repeated entries; order of appearance is preserved.

    element_text is forwarded to parse_custom_attribute_parts, which uses it
    to add the referenced 'text' to attributes that have an offset and length.

    Returns an empty list when custom_string contains no such element.
    """
    custom_attributes = []
    for match in re.finditer(r'\b(\w+) {(.*?)}', custom_string):
        attribute = parse_custom_attribute_parts(match.group(2), element_text=element_text)
        # Set after parsing: overrides an attribute literally named 'tag_name'.
        attribute['tag_name'] = match.group(1)
        custom_attributes.append(attribute)
    return custom_attributes

return metadata_list

def parse_custom_attribute_parts(attribute_string: str, element_text: str = None) -> Dict[str, any]:
    """Parse the string of custom attributes into a dictionary.

    Assumptions:
    1. attributes are always and only separated by semicolons (;)
    2. the key and value of an attribute are separated by the first colon (:)
       in the attribute, so a value may itself contain colons
    3. there is no nesting of attributes. The attributes are a flat list
    4. keys and values contain no semicolons

    Whitespace around keys and values is ignored, as are empty or
    whitespace-only parts (e.g. after a trailing semicolon).

    The values of 'offset' and 'length' are converted to int; all other values
    are kept as strings. When both 'offset' and 'length' are present and
    element_text is given, the referenced substring
    element_text[offset:offset + length] is added under the key 'text'.

    Raises ValueError if a part has no colon, or if an 'offset' or 'length'
    value is not an integer.
    """
    metadata = {}
    for part in attribute_string.split(';'):
        # Strip before the emptiness check so a whitespace-only part between
        # two semicolons is skipped rather than failing to unpack.
        part = part.strip()
        if not part:
            continue
        # Split on the first colon only, so values such as URLs stay intact.
        field, separator, value = part.partition(':')
        if not separator:
            raise ValueError(f"Invalid custom attribute part without ':' separator: {part!r}")
        field = field.strip()
        value = value.strip()
        if field in ('offset', 'length'):
            metadata[field] = int(value)
        else:
            metadata[field] = value
    if element_text is not None and 'offset' in metadata and 'length' in metadata:
        start = metadata['offset']
        metadata['text'] = element_text[start:start + metadata['length']]
    return metadata


def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable = None,
element_text: str = None) -> Dict[str, any]:
"""Parse custom metadata, like readingOrder, structure, textStyle, unclear, abbrev."""
if '@custom' not in text_element:
return metadata
return {}
metadata = {
'custom_attributes': parse_custom_attributes(text_element['@custom'], element_text=element_text)
}
if 'readingOrder {' in text_element['@custom']:
metadata['reading_order'] = parse_custom_metadata_element(text_element['@custom'], 'readingOrder')
if 'structure {' in text_element['@custom']:
Expand All @@ -174,10 +212,11 @@ def parse_custom_metadata(text_element: Dict[str, any], custom_tags: Iterable =
if custom_tags:
regex_tags = r'(?:' + '|'.join(custom_tags) + r')'
metadata['custom_tags'] = parse_custom_metadata_element_list(text_element['@custom'], regex_tags)
print(f"parser.parse_custom_metadata - metadata: {metadata}")
return metadata


def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Union[pdm.PageXMLTextRegion, None]:
def parse_textregion(text_region_dict: dict, custom_tags: Iterable = None) -> Union[pdm.PageXMLTextRegion, None]:
text_region = pdm.PageXMLTextRegion(
doc_id=text_region_dict['@id'] if '@id' in text_region_dict else None,
orientation=float(text_region_dict['@orientation']) if '@orientation' in text_region_dict else None,
Expand Down Expand Up @@ -214,7 +253,7 @@ def parse_textregion(text_region_dict: dict, custom_tags: Iterable = []) -> Unio
return text_region


def parse_textregion_list(textregion_dict_list: list, custom_tags: Iterable = None) -> List[pdm.PageXMLTextRegion]:
    """Parse a list of TextRegion dictionaries into a list of PageXMLTextRegion objects.

    Each dictionary is handed to parse_textregion together with custom_tags;
    the result keeps the order of the input list.

    custom_tags defaults to None rather than a mutable list so that no default
    object is shared between calls.
    """
    return [parse_textregion(textregion_dict, custom_tags) for textregion_dict in textregion_dict_list]


Expand Down Expand Up @@ -267,7 +306,7 @@ def parse_page_reading_order(page_json: dict) -> dict:
return reading_order


def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = []) -> pdm.PageXMLScan:
def parse_pagexml_json(pagexml_file: str, scan_json: dict, custom_tags: Iterable = None) -> pdm.PageXMLScan:
"""Parse a JSON/xmltodict representation of a PageXML file and return a PageXMLScan object."""
doc_id = pagexml_file
coords, text_regions = None, None
Expand Down Expand Up @@ -311,8 +350,8 @@ def read_pagexml_file(pagexml_file: str, encoding: str = 'utf-8') -> str:
return fh.read()


def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None, custom_tags: Iterable = {},
encoding: str = 'utf-8') -> pdm.PageXMLScan:
def parse_pagexml_file(pagexml_file: str, pagexml_data: Union[str, None] = None,
custom_tags: Iterable = None, encoding: str = 'utf-8') -> pdm.PageXMLScan:
"""Read PageXML from file (or content of file passed separately if read from elsewhere,
e.g. tarball) and return a PageXMLScan object.
Expand Down
97 changes: 97 additions & 0 deletions tests/parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,102 @@ def test_parsing_from_json_sets_parents(self):
self.assertEqual(True, line.parent == tr)


class TestCustomParser(unittest.TestCase):
    """Tests for parsing the '@custom' attribute string of PageXML elements."""

    def setUp(self) -> None:
        # One readingOrder element plus repeated and arbitrary tags:
        # nine '<tag> {...}' elements in total, four of them 'unclear'.
        custom = ("readingOrder {index:0;} abbrev {offset:0; length:4;} "
                  "unclear {offset:0; length:4; continued:true;} "
                  "unclear {offset:4; length:5; continued:true;} "
                  "abbrev {offset:9; length:9;} "
                  "unclear {offset:9; length:9; continued:true;} "
                  "textStyle {offset:20; length:1;superscript:true;} "
                  "madeup {offset:20; length:1;imaginary_attribute:true;} "
                  "unclear {offset:18; length:33; continued:true;}")
        self.element = {
            '@custom': custom
        }

    def test_parse_metadata_element_list_single_type_as_list(self):
        metadata = parser.parse_custom_metadata_element_list(self.element['@custom'], 'readingOrder')
        self.assertEqual(1, len(metadata))

    def test_parse_metadata_element_list_multi_type_as_list(self):
        metadata = parser.parse_custom_metadata_element_list(self.element['@custom'], 'unclear')
        self.assertEqual(4, len(metadata))

    def test_parse_custom_attribute_part_returns_dict(self):
        attributes = parser.parse_custom_attribute_parts('offset:9; length:5')
        expected = {'offset': 9, 'length': 5}
        self.assertEqual(expected, attributes)

    def test_parse_custom_attribute_part_returns_text_with_offset(self):
        text = "this is a text"
        attribute = parser.parse_custom_attribute_parts('offset:9; length:5', element_text=text)
        attrib_text = text[9:9+5]
        self.assertIn('text', attribute)
        self.assertEqual(attrib_text, attribute['text'])

    def test_parse_custom_attributes_returns_list(self):
        attributes = parser.parse_custom_attributes('unclear {offset:9; length:5}')
        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
        self.assertEqual(expected, attributes)

    def test_parse_custom_attributes_handles_semicolon_at_the_end(self):
        attributes = parser.parse_custom_attributes('unclear {offset:9; length:5;}')
        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
        self.assertEqual(expected, attributes)

    def test_parse_custom_attributes_handles_arbitrary_whitespace(self):
        attributes = parser.parse_custom_attributes('unclear {offset: 9; length :5 ;}')
        expected = [{'offset': 9, 'length': 5, 'tag_name': 'unclear'}]
        self.assertEqual(expected, attributes)

    def test_parse_custom_attributes_returns_list_with_repeated_elements(self):
        custom_string = 'unclear {offset:9; length:5} unclear {offset:16; length: 2;}'
        attributes = parser.parse_custom_attributes(custom_string)
        expected = [
            {'offset': 9, 'length': 5, 'tag_name': 'unclear'},
            {'offset': 16, 'length': 2, 'tag_name': 'unclear'}
        ]
        self.assertEqual(expected, attributes)

    def test_parse_custom_metadata_extracts_structure(self):
        custom = parser.parse_custom_metadata({"@custom": "structure {type: resolution}"})
        self.assertIn('structure', custom)
        # Backwards-compatible key holds the plain attribute dictionary ...
        expected = {'type': 'resolution'}
        self.assertEqual(expected, custom['structure'])
        # ... while custom_attributes also records the tag name.
        expected = [{'type': 'resolution', 'tag_name': 'structure'}]
        self.assertEqual(expected, custom['custom_attributes'])

    def test_parse_custom_metadata_extracts_all_tag_types(self):
        custom = parser.parse_custom_metadata(self.element, custom_tags=['unclear'])
        tag_types = {'readingOrder', 'unclear', 'abbrev', 'textStyle', 'madeup'}
        self.assertEqual(tag_types, {attr['tag_name'] for attr in custom['custom_attributes']})

    def test_parse_custom_metadata_returns_text_with_offset(self):
        text = "this is a text"
        custom = parser.parse_custom_metadata(self.element, element_text=text)
        attrib_text = text[0:4]
        unclears = [attr for attr in custom['custom_attributes'] if attr['tag_name'] == 'unclear']
        self.assertIn('text', unclears[0])
        self.assertEqual(attrib_text, unclears[0]['text'])

    def test_parse_custom_metadata_extracts_unique_tag_as_dict(self):
        custom = parser.parse_custom_metadata(self.element)
        self.assertIsInstance(custom['reading_order'], dict)

    def test_parse_custom_metadata_extracts_multiple_tags_of_same_type_as_list(self):
        custom = parser.parse_custom_metadata(self.element)
        unclear = [attr for attr in custom['custom_attributes'] if attr['tag_name'] == 'unclear']
        self.assertEqual(4, len(unclear))

    def test_parse_custom_metadata_extracts_all_tags(self):
        custom = parser.parse_custom_metadata(self.element)
        self.assertEqual(9, len(custom['custom_attributes']))


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 834196f

Please sign in to comment.