From 962037ee5619fc0773b7a4aa2104fedab01a3c0b Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Tue, 21 Nov 2023 17:30:35 +0100 Subject: [PATCH 1/3] Improve setting of doc_type and main_type - Make sure user-provided type is main type --- pagexml/model/physical_document_model.py | 88 ++++++++++++---- tests/physical_document_model_test.py | 122 ++++++++++++++--------- 2 files changed, 144 insertions(+), 66 deletions(-) diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index 761261e..e4cdd41 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -6,6 +6,7 @@ import numpy as np from scipy.spatial import ConvexHull +from scipy.spatial import QhullError def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, int]]: @@ -303,17 +304,21 @@ def parse_derived_coords(document_list: list) -> Coords: def coords_list_to_hull_coords(coords_list): # print(coords_list) - points = np.array([point for coords in coords_list for point in coords.points]) + points = [point for coords in coords_list for point in coords.points] + points_array = np.array(points) # print(points) try: - edges = points_to_hull_edges(points) + edges = points_to_hull_edges(points_array) + # print(edges) + hull_points = edges_to_hull_points(edges) + return Coords(hull_points) except IndexError: print([coords for coords in coords_list]) print('points:', points) raise - # print(edges) - hull_points = edges_to_hull_points(edges) - return Coords(hull_points) + except QhullError: + print('points:', points) + return Coords([point for point in points]) def points_to_hull_edges(points): @@ -343,20 +348,31 @@ def edges_to_hull_points(edges): class StructureDoc: def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, List[str]] = None, + main_type: Union[None, str] = None, metadata: Dict[str, any] = None, reading_order: Dict[int, str] = None): self.id = doc_id - self.type = doc_type - self.main_type = 'doc' + self.type = "structure_doc" self.metadata = metadata if metadata else {} - # if self.id and 'id' not in self.metadata: - # self.metadata['id'] = self.id - # if self.metadata and 'type' not in self.metadata: - # self.metadata['type'] = self.type + self.main_type = 'structure_doc' + if doc_type: + self.type = doc_type + if isinstance(doc_type, str): + self.main_type = main_type + if main_type: + self.main_type = main_type + if self.main_type: + self.set_main_type_id() + self.domain = None self.reading_order: Dict[int, str] = reading_order if reading_order else {} self.reading_order_number = {} self.parent: Union[StructureDoc, None] = None self.logical_parent: Union[StructureDoc, None] = None + def set_main_type_id(self): + main_type_id = f'{self.main_type}_id' + if self.main_type and self.id and main_type_id not in self.metadata: + self.metadata[main_type_id] = self.id + def set_parent(self, parent: StructureDoc): """Set parent document and add metadata of parent to this document's metadata""" self.parent = parent @@ -415,6 +431,7 @@ def json(self) -> Dict[str, any]: json_data = { 'id': self.id, 'type': self.type, + 'domain': self.domain, 'metadata': self.metadata } if self.reading_order: @@ -422,6 +439,30 @@ def json(self) -> Dict[str, any]: return json_data +def combine_doc_types(doc_type1: Union[str, List[str], None], + doc_type2: Union[str, List[str], None]): + if doc_type1 is None: + return doc_type2 + elif doc_type2 is None: + return doc_type1 + + # make a new list of combined type of doc_type1 so the original doc_type1 is unaffected + if isinstance(doc_type1, str): + combined_type = [doc_type1] + else: + combined_type = [dt for dt in doc_type1] + + # make sure doc_type2 is a list + if isinstance(doc_type2, str): + doc_type2 = [doc_type2] + + # add new doc 2 types to the combined type + for dt2 in doc_type2: + if dt2 not in doc_type1: + combined_type.append(dt2) + return combined_type + + class PhysicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, @@ -429,13 +470,18 @@ def __init__(self, doc_id: str = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None): - super().__init__(doc_id=doc_id, doc_type=doc_type, metadata=metadata, reading_order=reading_order) + super().__init__(doc_id=doc_id, doc_type='physical_structure_doc', metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords - self.main_type = 'physical_structure_doc' + if doc_type: + self.main_type = doc_type + self.set_main_type_id() + self.add_type(doc_type) + self.domain = 'physical' @property def json(self) -> Dict[str, any]: doc_json = super().json + doc_json['domain'] = self.domain if self.coords: doc_json['coords'] = self.coords.points return doc_json @@ -451,10 +497,15 @@ class LogicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, lines: List[PageXMLTextLine] = None, text_regions: List[PageXMLTextRegion] = None, reading_order: Dict[int, str] = None): - super().__init__(doc_id, doc_type, metadata, reading_order=reading_order) + super().__init__(doc_id, doc_type="logical_structure_doc", metadata=metadata, reading_order=reading_order) self.lines: List[PageXMLTextLine] = lines if lines else [] self.text_regions: List[PageXMLTextRegion] = text_regions if text_regions else [] self.logical_parent: Union[StructureDoc, None] = None + if doc_type: + self.add_type(doc_type) + self.main_type = doc_type + self.set_main_type_id() + self.domain = "logical" def set_logical_parent(self, parent: StructureDoc): """Set parent document and add metadata of parent to this document's metadata""" @@ -479,10 +530,11 @@ class PageXMLDoc(PhysicalStructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None): - super().__init__(doc_id=doc_id, doc_type="pagexml_doc", metadata=metadata, reading_order=reading_order) + if doc_type is None: + doc_type = 'pagexml_doc' + super().__init__(doc_id=doc_id, doc_type=doc_type, metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords - self.add_type(doc_type) - self.main_type = 'pagexml_doc' + self.add_type('pagexml_doc') @property def stats(self): @@ -494,7 +546,7 @@ class PageXMLWord(PageXMLDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, conf: float = None, text: str = None): - super().__init__(doc_id, "word", metadata, coords) + super().__init__(doc_id=doc_id, doc_type="word", metadata=metadata, coords=coords) self.conf = conf self.text = text self.main_type = 'word' diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index 138b66c..5e4cc9b 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -1,12 +1,13 @@ import unittest from unittest.mock import Mock -from pagexml.model.physical_document_model import Coords, StructureDoc, PhysicalStructureDoc, LogicalStructureDoc +import pagexml.model.physical_document_model as pdm +# from pagexml.model.physical_document_model import pdm.Coords, pdm.StructureDoc, PhysicalStructureDoc, pdm.LogicalStructureDoc class TestCoords(unittest.TestCase): def test_valid_points(self): - coords = Coords([(0, 0), (1, 1), (2, 2)]) + coords = pdm.Coords([(0, 0), (1, 1), (2, 2)]) self.assertEqual([(0, 0), (1, 1), (2, 2)], coords.points) self.assertEqual(0, coords.left) self.assertEqual(0, coords.top) @@ -18,56 +19,80 @@ def test_valid_points(self): self.assertEqual({'type': 'coords', 'points': [(0, 0), (1, 1), (2, 2)]}, coords.json) def test_point_string(self): - coords = Coords([(0, 0), (1, 1), (2, 2)]) + coords = pdm.Coords([(0, 0), (1, 1), (2, 2)]) self.assertEqual('0,0 1,1 2,2', coords.point_string) def test_invalid_points(self): with self.assertRaises(ValueError): - coords = Coords('invalid points') + coords = pdm.Coords('invalid points') + + +class TestHullCoords(unittest.TestCase): + + def test_list_of_coords_to_hull_of_coords(self): + coords1 = pdm.Coords([(0, 0), (1, 1), (2, 2)]) + coords2 = pdm.Coords([(3, 8), (4, 7), (6, 5)]) + hull_coords = pdm.coords_list_to_hull_coords([coords1, coords2]) + x_points = [point[0] for point in coords1.points + coords2.points] + y_points = [point[1] for point in coords1.points + coords2.points] + self.assertEqual(hull_coords.left, min(x_points)) + self.assertEqual(hull_coords.right, max(x_points)) + self.assertEqual(hull_coords.top, min(y_points)) + self.assertEqual(hull_coords.bottom, max(y_points)) + + def test_list_of_line_point_coords_to_hull_of_coords(self): + # two points form a line, no convex hull + coords1 = pdm.Coords([(0, 0)]) + coords2 = pdm.Coords([(0, 5)]) + # hull coords should just be the two points + hull_coords = pdm.coords_list_to_hull_coords([coords1, coords2]) + for pi, point in enumerate(coords1.points + coords2.points): + with self.subTest(pi): + self.assertIn(point, hull_coords.points) class TestStructureDoc(unittest.TestCase): def test_init(self): - doc = StructureDoc() + doc = pdm.StructureDoc() self.assertIsNone(doc.id) - self.assertIsNone(doc.type) - self.assertEqual('doc', doc.main_type) + self.assertEqual('structure_doc', doc.type) + self.assertEqual('structure_doc', doc.main_type) self.assertEqual({}, doc.metadata) self.assertEqual({}, doc.reading_order) self.assertEqual({}, doc.reading_order_number) self.assertIsNone(doc.parent) def test_set_parent(self): - parent_doc = StructureDoc(doc_id='parent_doc') - child_doc = StructureDoc(doc_id='child_doc') + parent_doc = pdm.StructureDoc(doc_id='parent_doc') + child_doc = pdm.StructureDoc(doc_id='child_doc') child_doc.set_parent(parent_doc) self.assertEqual(parent_doc, child_doc.parent) - self.assertEqual('doc', child_doc.metadata['parent_type']) + self.assertEqual('structure_doc', child_doc.metadata['parent_type']) self.assertEqual('parent_doc', child_doc.metadata['parent_id']) def test_add_type(self): - doc = StructureDoc(doc_type='doc') + doc = pdm.StructureDoc(doc_type='structure_doc') # Add a new type doc.add_type('report') - self.assertEqual(['doc', 'report'], doc.type) + self.assertEqual(['structure_doc', 'report'], doc.type) # Add the same type twice - doc.add_type('doc') - self.assertEqual(['doc', 'report'], doc.type) + doc.add_type('structure_doc') + self.assertEqual(['structure_doc', 'report'], doc.type) # Add multiple types at once doc.add_type(['pdf', 'ocr']) - self.assertEqual(['doc', 'report', 'pdf', 'ocr'], doc.type) + self.assertEqual(['structure_doc', 'report', 'pdf', 'ocr'], doc.type) def test_remove_type(self): - doc = StructureDoc(doc_type=['doc', 'report']) + doc = pdm.StructureDoc(doc_type=['structure_doc', 'report']) # Remove an existing type - doc.remove_type('doc') + doc.remove_type('structure_doc') self.assertEqual('report', doc.type) # Remove a non-existing type @@ -79,48 +104,48 @@ def test_remove_type(self): self.assertEqual([], doc.type) def test_has_type(self): - doc = StructureDoc(doc_type=['doc', 'report']) + doc = pdm.StructureDoc(doc_type=['structure_doc', 'report']) # Check for an existing type - self.assertTrue(doc.has_type('doc')) + self.assertTrue(doc.has_type('structure_doc')) # Check for a non-existing type self.assertFalse(doc.has_type('pdf')) def test_types(self): - doc = StructureDoc(doc_type=['doc', 'report']) + doc = pdm.StructureDoc(doc_type=['structure_doc', 'report']) # Get all types - self.assertEqual({'doc', 'report'}, doc.types) + self.assertEqual({'structure_doc', 'report'}, doc.types) def test_set_as_parent(self): - parent_doc = StructureDoc(doc_id='parent_doc') - child_doc1 = StructureDoc(doc_id='child_doc1') - child_doc2 = StructureDoc(doc_id='child_doc2') + parent_doc = pdm.StructureDoc(doc_id='parent_doc') + child_doc1 = pdm.StructureDoc(doc_id='child_doc1') + child_doc2 = pdm.StructureDoc(doc_id='child_doc2') child_docs = [child_doc1, child_doc2] parent_doc.set_as_parent(child_docs) self.assertEqual(child_doc1.parent, parent_doc) self.assertEqual(child_doc2.parent, parent_doc) - self.assertEqual(child_doc1.metadata['parent_type'], 'doc') + self.assertEqual(child_doc1.metadata['parent_type'], 'structure_doc') self.assertEqual(child_doc1.metadata['parent_id'], 'parent_doc') - self.assertEqual(child_doc2.metadata['parent_type'], 'doc') + self.assertEqual(child_doc2.metadata['parent_type'], 'structure_doc') self.assertEqual(child_doc2.metadata['parent_id'], 'parent_doc') def test_add_parent_id_to_metadata(self): - parent_doc = StructureDoc(doc_id='parent_doc') - child_doc = StructureDoc(doc_id='child_doc') + parent_doc = pdm.StructureDoc(doc_id='parent_doc') + child_doc = pdm.StructureDoc(doc_id='child_doc') child_doc.set_parent(parent_doc) child_doc.add_parent_id_to_metadata() - self.assertEqual('doc', child_doc.metadata['parent_type']) + self.assertEqual('structure_doc', child_doc.metadata['parent_type']) self.assertEqual('parent_doc', child_doc.metadata['parent_id']) - self.assertEqual('parent_doc', child_doc.metadata['doc_id']) + self.assertEqual('parent_doc', child_doc.metadata['structure_doc_id']) def test_json(self): - doc = StructureDoc(doc_id='doc1', doc_type='book', metadata={'title': 'The Great Gatsby'}) + doc = pdm.StructureDoc(doc_id='doc1', doc_type='book', metadata={'title': 'The Great Gatsby'}) json_data = doc.json self.assertEqual('doc1', json_data['id']) self.assertEqual('book', json_data['type']) @@ -136,26 +161,27 @@ class TestPhysicalStructureDoc(unittest.TestCase): def setUp(self): self.metadata = {'author': 'Jane Doe'} - self.coords = Coords([(0, 0), (0, 10), (10, 10), (10, 0)]) - self.doc = PhysicalStructureDoc(doc_id='doc1', doc_type='book', metadata=self.metadata, coords=self.coords) + self.coords = pdm.Coords([(0, 0), (0, 10), (10, 10), (10, 0)]) + self.doc = pdm.PhysicalStructureDoc(doc_id='doc1', doc_type='book', metadata=self.metadata, coords=self.coords) def test_init(self): self.assertEqual('doc1', self.doc.id) - self.assertEqual('book', self.doc.type) + self.assertEqual(['physical_structure_doc', 'book'], self.doc.type) self.assertEqual(self.metadata, self.doc.metadata) self.assertEqual(self.coords, self.doc.coords) def test_set_derived_id(self): - parent = Mock(spec=StructureDoc) + parent = Mock(spec=pdm.StructureDoc) parent.id = 'parent_doc' self.doc.set_derived_id(parent.id) - self.assertEqual('parent_doc-physical_structure_doc-0-0-10-10', self.doc.id) + self.assertEqual('parent_doc-book-0-0-10-10', self.doc.id) def test_json(self): expected_json = { 'id': 'doc1', - 'type': 'book', - 'metadata': {'author': 'Jane Doe'}, + 'type': ['physical_structure_doc', 'book'], + 'domain': 'physical', + 'metadata': {'author': 'Jane Doe', 'book_id': 'doc1'}, 'coords': [(0, 0), (0, 10), (10, 10), (10, 0)] } self.assertEqual(expected_json, self.doc.json) @@ -163,7 +189,7 @@ def test_json(self): class TestLogicalStructureDoc(unittest.TestCase): def setUp(self): - self.doc = LogicalStructureDoc( + self.doc = pdm.LogicalStructureDoc( doc_id='doc_001', doc_type='article', metadata={'author': 'John Doe'}, @@ -172,7 +198,7 @@ def setUp(self): ) def test_set_logical_parent(self): - parent_doc = LogicalStructureDoc( + parent_doc = pdm.LogicalStructureDoc( doc_id='doc_002', doc_type='journal', metadata={'publisher': 'New York Times'}, @@ -182,14 +208,14 @@ def test_set_logical_parent(self): self.doc.set_logical_parent(parent_doc) self.assertEqual(self.doc.logical_parent, parent_doc) self.assertIn('logical_parent_type', self.doc.metadata) - self.assertEqual('doc', self.doc.metadata['logical_parent_type']) - self.assertIn('logical_parent_id', self.doc.metadata) + self.assertEqual('journal', self.doc.metadata['logical_parent_type']) + self.assertIn('article_id', self.doc.metadata) self.assertEqual('doc_002', self.doc.metadata['logical_parent_id']) - self.assertIn('doc_id', self.doc.metadata) - self.assertEqual('doc_002', self.doc.metadata['doc_id']) + self.assertIn('journal_id', self.doc.metadata) + self.assertEqual('doc_002', self.doc.metadata['journal_id']) def test_add_logical_parent_id_to_metadata(self): - parent_doc = LogicalStructureDoc( + parent_doc = pdm.LogicalStructureDoc( doc_id='doc_002', doc_type='journal', metadata={'publisher': 'New York Times'}, @@ -199,11 +225,11 @@ def test_add_logical_parent_id_to_metadata(self): self.doc.logical_parent = parent_doc self.doc.add_logical_parent_id_to_metadata() self.assertIn('logical_parent_type', self.doc.metadata) - self.assertEqual('doc', self.doc.metadata['logical_parent_type']) + self.assertEqual('journal', self.doc.metadata['logical_parent_type']) self.assertIn('logical_parent_id', self.doc.metadata) self.assertEqual('doc_002', self.doc.metadata['logical_parent_id']) - self.assertIn('doc_id', self.doc.metadata) - self.assertEqual('doc_002', self.doc.metadata['doc_id']) + self.assertIn('journal_id', self.doc.metadata) + self.assertEqual('doc_002', self.doc.metadata['journal_id']) if __name__ == '__main__': From bb7850d5695f3b9b672579e28237755e87134c4e Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Tue, 21 Nov 2023 17:35:10 +0100 Subject: [PATCH 2/3] Remove id duplication in metadata --- pagexml/model/physical_document_model.py | 9 --------- tests/physical_document_model_test.py | 3 +-- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index e4cdd41..8d423a3 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -360,19 +360,12 @@ def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, L self.main_type = main_type if main_type: self.main_type = main_type - if self.main_type: - self.set_main_type_id() self.domain = None self.reading_order: Dict[int, str] = reading_order if reading_order else {} self.reading_order_number = {} self.parent: Union[StructureDoc, None] = None self.logical_parent: Union[StructureDoc, None] = None - def set_main_type_id(self): - main_type_id = f'{self.main_type}_id' - if self.main_type and self.id and main_type_id not in self.metadata: - self.metadata[main_type_id] = self.id - def set_parent(self, parent: StructureDoc): """Set parent document and add metadata of parent to this document's metadata""" self.parent = parent @@ -474,7 +467,6 @@ def __init__(self, doc_id: str = None, self.coords: Union[None, Coords] = coords if doc_type: self.main_type = doc_type - self.set_main_type_id() self.add_type(doc_type) self.domain = 'physical' @@ -504,7 +496,6 @@ def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, if doc_type: self.add_type(doc_type) self.main_type = doc_type - self.set_main_type_id() self.domain = "logical" def set_logical_parent(self, parent: StructureDoc): diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index 5e4cc9b..e79b715 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -181,7 +181,7 @@ def test_json(self): 'id': 'doc1', 'type': ['physical_structure_doc', 'book'], 'domain': 'physical', - 'metadata': {'author': 'Jane Doe', 'book_id': 'doc1'}, + 'metadata': {'author': 'Jane Doe'}, 'coords': [(0, 0), (0, 10), (10, 10), (10, 0)] } self.assertEqual(expected_json, self.doc.json) @@ -209,7 +209,6 @@ def test_set_logical_parent(self): self.assertEqual(self.doc.logical_parent, parent_doc) self.assertIn('logical_parent_type', self.doc.metadata) self.assertEqual('journal', self.doc.metadata['logical_parent_type']) - self.assertIn('article_id', self.doc.metadata) self.assertEqual('doc_002', self.doc.metadata['logical_parent_id']) self.assertIn('journal_id', self.doc.metadata) self.assertEqual('doc_002', self.doc.metadata['journal_id']) From fce990b27b1512ef80e103e2e29a8c2a29ee22eb Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Tue, 21 Nov 2023 17:48:16 +0100 Subject: [PATCH 3/3] Allow setting parent id in doc metadata This makes the API more consistent between physical and logical domains. --- pagexml/model/physical_document_model.py | 7 +++++++ tests/physical_document_model_test.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index 8d423a3..2c4b248 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -483,6 +483,13 @@ def set_derived_id(self, parent_id: str): self.id = f"{parent_id}-{self.main_type}-{box_string}" # self.metadata['id'] = self.id + def add_parent_id_to_metadata(self): + if self.parent: + self.metadata['parent_type'] = self.parent.main_type + self.metadata['parent_id'] = self.parent.id + if hasattr(self.parent, 'main_type') and self.parent.main_type is not None: + self.metadata[f'{self.parent.main_type}_id'] = self.parent.id + class LogicalStructureDoc(StructureDoc): diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index e79b715..e2cdfe9 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -176,6 +176,14 @@ def test_set_derived_id(self): self.doc.set_derived_id(parent.id) self.assertEqual('parent_doc-book-0-0-10-10', self.doc.id) + def test_add_parent_id_to_metadata(self): + child = pdm.PhysicalStructureDoc(doc_id='doc2', doc_type='chapter') + child.id = 'parent_doc' + self.doc.set_as_parent([child]) + self.doc.add_parent_id_to_metadata() + self.assertIn('book_id', child.metadata) + self.assertEqual('doc1', child.metadata['book_id']) + def test_json(self): expected_json = { 'id': 'doc1',