diff --git a/pagexml/analysis/layout_stats.py b/pagexml/analysis/layout_stats.py index 2c12980..c54ae53 100644 --- a/pagexml/analysis/layout_stats.py +++ b/pagexml/analysis/layout_stats.py @@ -96,7 +96,7 @@ def interpolate_baseline_points(points: List[Tuple[int, int]], def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]], - step: int = 50): + step: int = 50) -> np.ndarray: if points1 is None or points2 is None: return np.array([]) b1_points = interpolate_baseline_points(points1, step=step) @@ -158,12 +158,15 @@ def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.Pa return distances -def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int: +def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]], + debug: int = 0) -> int: """Compute the average (mean) baseline height for comparing lines that are not horizontally aligned. :param line: a TextLine or a list of adjacent lines :type line: PageXMLTextLine + :param debug: Boolean to show debug information or not + :type debug: bool :return: the average (mean) baseline height across all its baseline points :rtype: int """ @@ -179,7 +182,8 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex # segment contributes its average height times its width total_avg += segment_avg * abs(next_point[0] - curr_point[0]) if total_avg < 0: - print(f'total_avg: {total_avg}\n') + print(f'pagexml.analysis.layout_stats.average_baseline_height - ' + f'negative total_avg {total_avg} for line {line.id}\n') # average is total of average heights divided by total width x = sorted([point[0] for point in points]) @@ -189,8 +193,9 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex else: # this should not happen, but if it does, we need to calculate # the average differently, to avoid a division by zero error - print(f"total_avg={total_avg}") - print(f"baseline.points[-1][0]={points[-1][0]}") + if debug > 0: + print(f"total_avg={total_avg}") + print(f"baseline.points[-1][0]={points[-1][0]}") xcoords = [p[0] for p in points] left_x = min(xcoords) right_x = max(xcoords) @@ -200,7 +205,7 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex return int(total_avg) -def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, +def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, step: int = 50, debug: int = 0) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]: """Split the list of bounding polygon coordinates of a line in sets of points above and below @@ -209,6 +214,8 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, :param line: a PageXML text line :type line: PageXMLTextLine + :param step: number of pixels between interpolated points + :type step: int :param debug: the detail level of debug information (0 = none, higher is more) :type debug: int :return: two lists of bounding polygon points @@ -225,12 +232,14 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, return above_baseline, below_baseline if line.coords.left > line.baseline.right: return above_baseline, below_baseline - interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()] + interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=step).items()] if debug > 2: + print('sort_coords_above_below_baseline - line.id:', line.id) + print('sort_coords_above_below_baseline - line.coords.points:', line.coords.points) print('baseline_points:', line.baseline.points) print('interpolated_baseline_points:', interpolated_baseline_points) sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0]) - if debug > 0: + if debug > 1: print('sorted_coord_points:', sorted_coord_points) print('len(sorted_coord_points):', len(sorted_coord_points)) if debug > 1: @@ -240,7 +249,7 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, for ci_b, curr_b in enumerate(interpolated_baseline_points): curr_bx, curr_by = curr_b next_b = interpolated_baseline_points[ci_b + 1] if ci_b + 1 < num_baseline_points else None - if debug > 0: + if debug > 1: print(f'sort_above_below - curr_b: {curr_b}') print('\tci_c:', ci_c, '\tnum_coord_points:', num_coord_points) if ci_c == num_coord_points: @@ -249,24 +258,29 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, curr_cx, curr_cy = curr_c if next_b and abs(next_b[0] - curr_cx) < abs(curr_b[0] - curr_cx): break - if debug > 0: + if debug > 1: print(f'sort_above_below - curr_c ({ci_c}): {curr_c}') ci_c += 1 if curr_cy < curr_by: - if debug > 0: + if debug > 1: print(f'sort_above_below - above') above_baseline.append(curr_c) else: - if debug > 0: + if debug > 1: print(f'sort_above_below - below') below_baseline.append(curr_c) + if debug > 2: + print('sort_coords_above_below_baseline - above_baseline:', above_baseline) + print('sort_coords_above_below_baseline - below_baseline:', below_baseline) return above_baseline, below_baseline def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50, ignore_errors: bool = True, debug: int = 0) -> np.array: - above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug) + if line.baseline.width <= step: + step = 5 + above_baseline, below_baseline = sort_coords_above_below_baseline(line, step=step, debug=debug) if len(above_baseline) == 0: if ignore_errors is False: ValueError(f'line {line.id} has no bounding coordinates above baseline') @@ -276,6 +290,10 @@ def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50, ValueError(f'Warning: line {line.id} has no bounding coordinates below baseline') int_base = interpolate_baseline_points(line.baseline.points, step=step) int_above = interpolate_baseline_points(above_baseline, step=step) + if debug > 1: + print('get_text_heights - line.id:', line.id) + print('get_text_heights - int_base:', int_base) + print('get_text_heights - int_above:', int_above) height = {} for x in int_base: @@ -327,7 +345,7 @@ def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]: else: distances = compute_bounding_box_distances(curr_line, next_line) all_distances.append(distances) - return all_distances + return all_distances def get_textregion_line_distances(text_region: pdm.PageXMLTextRegion) -> List[np.ndarray]: @@ -593,7 +611,7 @@ def get_line_widths(pagexml_files: List[Union[str, pdm.PageXMLTextRegion]] = Non def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = 50, - min_ratio: float = 0.25) -> List[int]: + min_ratio: float = 0.25, debug: int = 0) -> List[int]: """Find the minima in the distribution of line widths relative to the peaks in the distribution. These minima represent the boundaries between clusters of lines within the same line width intervals. @@ -619,29 +637,37 @@ def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = curr_max_width = None curr_min_width = None prev_freq = 0 + if debug > 0: + print(f"find_line_width_boundary_points - total_widths: {total_widths}") + print(f"find_line_width_boundary_points - max_width: {max_width}") + print(f"find_line_width_boundary_points - max_freq: {max_freq}") for w in range(0, max_width + 1, line_bin_size): f = width_freq[w] if f > curr_max_freq: - # print(f'\tfreq {f} bigger than curr max: {curr_max_freq}') + if debug > 0: + print(f'\tfreq {f} bigger than curr max: {curr_max_freq}') curr_max_freq = f curr_max_width = w if f < prev_freq and f < curr_min_freq: - # print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}') + if debug > 0: + print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}') curr_min_freq = f curr_min_width = w if f / num_lines > 0.01 and f > prev_freq and f > curr_min_freq: - # print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}') - # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: - # print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq) + if debug > 0: + print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}') + # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: + print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq) if (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: boundary_points.append((curr_min_width, curr_min_freq)) curr_max_freq = 0 curr_max_width = 0 curr_min_freq = max_freq + 1 - # print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}" - # f"\tcurr_min_freq: {curr_min_freq: >8}" - # f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}") + if debug > 0: + print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}" + f"\tcurr_min_freq: {curr_min_freq: >8}" + f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}") prev_freq = f return [bp[0] for bp in boundary_points] diff --git a/pagexml/analysis/stats.py b/pagexml/analysis/stats.py index 28a77d5..432638c 100644 --- a/pagexml/analysis/stats.py +++ b/pagexml/analysis/stats.py @@ -7,6 +7,9 @@ import pagexml.model.physical_document_model as pdm +DEFAULT_ELEMENTS = ['lines', 'words', 'text_regions', 'columns', 'extra', 'pages'] + + def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[int]: bin_width = pagexml_doc.coords.width / 5 return [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)] @@ -17,11 +20,14 @@ def _init_doc_stats(line_width_boundary_points: List[int], fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height', 'lines', 'words', 'text_regions', 'columns', 'extra', 'pages', - 'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words', + 'num_words', 'num_alpha_words', 'num_number_words', + 'num_title_words', 'num_non_title_words', 'num_stop_words', 'num_punctuation_words', 'num_oversized_words'] doc_stats = {field: [] for field in fields} for cat_wpl in text_stats.wpl_cat_range: doc_stats[f"words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = [] + for cat_wpl in text_stats.wpl_cat_range: + doc_stats[f"alpha_words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = [] for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size): doc_stats[f"num_words_length_{length_bin}"] = [] for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points): @@ -70,19 +76,23 @@ def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTex for pi, pagexml_doc in enumerate(pagexml_docs): pagexml_doc_stats['doc_id'].append(pagexml_doc.id) pagexml_doc_stats['doc_num'].append(pi + 1) - pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width) - pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height) + pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width if pagexml_doc.coords else None) + pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height if pagexml_doc.coords else None) lines = [line for line in pagexml_doc.get_lines() if line.text is not None] words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries) word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words, max_word_length=max_word_length) - wpl_stats = text_stats.get_words_per_line(lines) - for field in pagexml_doc.stats: - pagexml_doc_stats[field].append(pagexml_doc.stats[field]) + wpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=False) + awpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=True) + # for field in pagexml_doc.stats: + for field in DEFAULT_ELEMENTS: + pagexml_doc_stats[field].append(pagexml_doc.stats[field] if field in pagexml_doc.stats else 0) for word_cat in word_stats: pagexml_doc_stats[word_cat].append((word_stats[word_cat])) for wpl_cat in text_stats.wpl_cat_range.values(): pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat]) + for wpl_cat in text_stats.wpl_cat_range.values(): + pagexml_doc_stats[f'alpha_words_per_line_{wpl_cat}'].append(awpl_stats[wpl_cat]) if line_width_boundary_points is None: bin_width = pagexml_doc.coords.width / 5 line_width_boundary_points = [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)] diff --git a/pagexml/analysis/text_stats.py b/pagexml/analysis/text_stats.py index f3b08f1..3b18487 100644 --- a/pagexml/analysis/text_stats.py +++ b/pagexml/analysis/text_stats.py @@ -927,13 +927,16 @@ def get_typical_start_end_words(wbd: WordBreakDetector, return typical_start_words, typical_end_words -def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False): +def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False, + alpha_words_only: bool = False): """Return a Counter of the number of words per line of a PageXML pagexml_doc object. :param lines: a list of PageXMLTextLine objects :type lines: List[PageXMLTextLine] :param use_re_word_boundaries: whether to split words of a line using RegEx word boundaries :type use_re_word_boundaries: bool + :param alpha_words_only: whether to only count words consisting of alpha characters (e.g. no numbers) + :type alpha_words_only: bool :return: a counter of the number of words per line of a pagexml_doc :rtype: Counter """ @@ -944,9 +947,12 @@ def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: if line.text is None or line.text == '': words = [] elif use_re_word_boundaries: - words = [w.replace(' ', '') for w in re.split(r'\b', line.text) if w != ' ' and w != ''] + words = [w.replace(' ', '') for w in re.split(r'\b', line.text)] else: words = [w for w in line.text.split(' ')] + words = [w for w in words if w != ' ' and w != ''] + if alpha_words_only is True: + words = [w for w in words if w.isalpha()] # words_per_line.update([len(words)]) if len(words) in wpl_to_cat: wpl_cat = wpl_to_cat[len(words)] @@ -991,6 +997,7 @@ def get_word_cat_stats(words, stop_words=None, max_word_length: int = 30, word_length_freq = Counter([len(w) for w in words if len(w) <= max_word_length]) word_cat_stats = { 'num_words': len(words), + 'num_alpha_words': len([w for w in words if w.isalpha()]), 'num_number_words': len([w for w in words if w.isdigit()]), 'num_title_words': len([w for w in words if w.istitle()]), 'num_non_title_words': len([w for w in words if w.istitle() is False]), diff --git a/pagexml/column_parser.py b/pagexml/column_parser.py index 63ea874..0630510 100644 --- a/pagexml/column_parser.py +++ b/pagexml/column_parser.py @@ -79,7 +79,7 @@ def column_bounding_box_surrounds_lines(column: pdm.PageXMLColumn) -> bool: """Check if the column coordinates contain the coordinate boxes of the column lines.""" for line in column.get_lines(): - if not pagexml_helper.elements_overlap(column, line, threshold=0.6): + if not pagexml_helper.regions_overlap(column, line, threshold=0.6): return False return True diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index 64e5a13..d66cf17 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -3,6 +3,7 @@ import re import string from collections import Counter +from enum import Enum from typing import Dict, Generator, List, Set, Tuple, Union import numpy as np @@ -13,22 +14,65 @@ import pagexml.model.physical_document_model as pdm -def elements_overlap(element1: pdm.PageXMLDoc, element2: pdm.PageXMLDoc, - threshold: float = 0.5) -> bool: - """Check if two elements have overlapping coordinates.""" - v_overlap = pdm.get_vertical_overlap(element1, element2) - h_overlap = pdm.get_horizontal_overlap(element1, element2) - if v_overlap / element1.coords.height > threshold: - if h_overlap / element1.coords.width > threshold: - return True - if v_overlap / element2.coords.height > threshold: - if h_overlap / element2.coords.width > threshold: - return True +def is_point_inside(point: Tuple[int, int], element: pdm.PageXMLDoc) -> bool: + x, y = point + if x < element.coords.left or x > element.coords.right: + return False + if y < element.coords.top or y > element.coords.bottom: + return False + return True + + +class RegionType(Enum): + + POINT = 1 + HLINE = 2 + VLINE = 3 + BOX = 4 + + +def get_region_type(element: pdm.PageXMLDoc) -> RegionType: + if element.coords.height == 0: + if element.coords.width == 0: + return RegionType.POINT else: - return False + return RegionType.HLINE + elif element.coords.width == 0: + return RegionType.VLINE else: + return RegionType.BOX + + +def same_point(point1: Tuple[int, int], point2: Tuple[int, int]) -> bool: + """Check if two points are the same.""" + return point1[0] == point2[0] and point1[1] == point2[1] + + +def regions_overlap(region1: pdm.PageXMLDoc, region2: pdm.PageXMLDoc, + threshold: float = 0.5) -> bool: + """Check if two regions have overlapping coordinates. + + Assumption: points are pixels, so regions with at least one point have at least + a width, height and area of 1.""" + if region1.coords is None or region2.coords is None: return False + height1 = region1.coords.height + 1 + width1 = region1.coords.width + 1 + height2 = region2.coords.height + 1 + width2 = region2.coords.width + 1 + + v_overlap = pdm.get_vertical_overlap(region1, region2) + h_overlap = pdm.get_horizontal_overlap(region1, region2) + + if v_overlap / height1 > threshold: + if h_overlap / width1 > threshold: + return True + if v_overlap / height2 > threshold: + if h_overlap / width2 > threshold: + return True + return False + def sort_regions_in_reading_order(doc: pdm.PageXMLDoc) -> List[pdm.PageXMLTextRegion]: """Sort text regions in reading order. If an explicit reading order is given, diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index 65acda3..ab5795b 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -7,6 +7,7 @@ import numpy as np from scipy.spatial import ConvexHull from scipy.spatial import QhullError +from shapely.geometry import Polygon def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, int]]: @@ -30,6 +31,7 @@ def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, i class Coords: def __init__(self, points: Union[str, List[Tuple[int, int]]]): + """Coordinates of a PageXML region based on a set of points.""" self.points: List[Tuple[int, int]] = parse_points(points) self.point_string = " ".join( ",".join([str(point[0]), str(point[1])]) for point in self.points @@ -149,13 +151,13 @@ def get_horizontal_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: else: overlap_left = max([doc1.coords.left, doc2.coords.left]) overlap_right = min([doc1.coords.right, doc2.coords.right]) - return overlap_right - overlap_left if overlap_right > overlap_left else 0 + return overlap_right - overlap_left + 1 if overlap_right >= overlap_left else 0 def get_vertical_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: overlap_top = max([doc1.coords.top, doc2.coords.top]) overlap_bottom = min([doc1.coords.bottom, doc2.coords.bottom]) - return overlap_bottom - overlap_top if overlap_bottom > overlap_top else 0 + return overlap_bottom - overlap_top + 1 if overlap_bottom >= overlap_top else 0 def is_vertically_overlapping(region1: PageXMLDoc, @@ -299,34 +301,40 @@ def sort_lines(line1: PageXMLTextLine, line2: PageXMLTextLine, as_column: bool = def parse_derived_coords(document_list: list) -> Coords: """Derive scan coordinates for a composite document based on the list of documents it contains. A convex hull is drawn around all points of all contained documents.""" - return coords_list_to_hull_coords([document.coords for document in document_list]) + try: + return coords_list_to_hull_coords([document.coords for document in document_list]) + except (IndexError, QhullError) as err: + print('pagexml.model.physical_document_model.parse_derived_coords - ' + 'Error with coords in list of documents with the following ids:\n', + [doc.id for doc in document_list]) + raise def coords_list_to_hull_coords(coords_list): # print(coords_list) points = [point for coords in coords_list for point in coords.points] - points_array = np.array(points) + if len(points) <= 2: + return Coords(points) # print(points) try: - edges = points_to_hull_edges(points_array) + edges = points_to_hull_edges(points) # print(edges) hull_points = edges_to_hull_points(edges) return Coords(hull_points) - except IndexError: - print([coords for coords in coords_list]) - print('points:', points) + except (IndexError, QhullError): + print('pagexml.model.physical_document_model.coords_list_to_hull_coords - IndexError') + print('coords in coords_list:', [coords for coords in coords_list]) + print('points derived from list of coords:', points) raise - except QhullError: - print('points:', points) - return Coords([point for point in points]) -def points_to_hull_edges(points): - hull = ConvexHull(points) +def points_to_hull_edges(points: List[Tuple[int, int]]): + points_array = np.array(points) + hull = ConvexHull(points_array) edges = defaultdict(dict) for simplex in hull.simplices: - p1 = (int(points[simplex, 0][0]), int(points[simplex, 1][0])) - p2 = (int(points[simplex, 0][1]), int(points[simplex, 1][1])) + p1 = (int(points_array[simplex, 0][0]), int(points_array[simplex, 1][0])) + p2 = (int(points_array[simplex, 0][1]), int(points_array[simplex, 1][1])) edges[p2][p1] = 1 edges[p1][p2] = 1 return edges @@ -354,11 +362,11 @@ def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, L self.type = "structure_doc" self.metadata = metadata if metadata else {} self.main_type = 'structure_doc' - if doc_type: - self.type = doc_type + if doc_type is not None: + self.add_type(doc_type) if isinstance(doc_type, str): - self.main_type = main_type - if main_type: + self.main_type = doc_type + if main_type is not None: self.main_type = main_type self.domain = None self.reading_order: Dict[int, str] = reading_order if reading_order else {} @@ -375,6 +383,8 @@ def add_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] + elif isinstance(self.type, set): + self.type = list(self.type) for doc_type in doc_types: if doc_type not in self.type: self.type.append(doc_type) @@ -383,6 +393,8 @@ def remove_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] + elif isinstance(self.type, set): + self.type = list(self.type) for doc_type in doc_types: if doc_type in self.type: self.type.remove(doc_type) @@ -423,7 +435,8 @@ def add_parent_id_to_metadata(self): def json(self) -> Dict[str, any]: json_data = { 'id': self.id, - 'type': self.type, + 'type': list(self.type) if isinstance(self.type, set) else self.type, + 'main_type': self.main_type, 'domain': self.domain, 'metadata': self.metadata } @@ -465,11 +478,27 @@ def __init__(self, doc_id: str = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type='physical_structure_doc', metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords + self._area = None if doc_type: self.main_type = doc_type self.add_type(doc_type) self.domain = 'physical' + @property + def area(self): + """Returns the size of the area represented by the convex hull of the coordinates. + + The area is calculated the first time this function is called and stored in a + private property for later calls. The reason to not call it at object instantiation + is that it probably not often needed and only computing it when needed is more + efficient.""" + if self._area is None: + if self.coords is None: + self._area = 0 + else: + self._area = poly_area(self.coords.points) + return self._area + @property def json(self) -> Dict[str, any]: doc_json = super().json @@ -491,6 +520,28 @@ def add_parent_id_to_metadata(self): self.metadata[f'{self.parent.main_type}_id'] = self.parent.id +def poly_area(points: List[Tuple[int, int]]): + """Compute the surface area of a polygon represented by a set of Points.""" + if points is None: + return 0 + if len(points) <= 2: + # two points represent a line, which has an area of zero + return 0 + hull_points = points_to_hull_edges(points) + polygon = Polygon(hull_points) + return polygon.area + + +class EmptyRegionDoc(PhysicalStructureDoc): + + def __init__(self, doc_id: str = None, doc_type: str = None, metadata: Dict[str, any] = None, + coords: Coords = None): + super().__init__(doc_id=doc_id, doc_type=doc_type, metadata=metadata, coords=coords) + self.add_type('empty') + if doc_type is None: + self.main_type = 'empty' + + class LogicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, @@ -689,6 +740,7 @@ def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, self.set_text_regions_in_reader_order() if doc_type: self.add_type(doc_type) + self.empty_regions = [] def __repr__(self): stats = json.dumps(self.stats) diff --git a/pyproject.toml b/pyproject.toml index 211baf5..bd8db5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] python = ">=3.8 <4.0" -fuzzy-search = "^2.0.0a" +fuzzy-search = "^2.1.0" matplotlib = ">=3.7.0" numpy = ">=1.22.3" pandas = ">=1.5.3 <3.0.0" @@ -30,6 +30,7 @@ scipy = ">=1.7.0" seaborn = "^0.13.0" tqdm = "^4.64.1" xmltodict = "^0.13.0" +shapely = "^2.0.3" [tool.poetry.dev-dependencies] diff --git a/tests/helper-pagexml_helper_test.py b/tests/helper-pagexml_helper_test.py index 55cd9fc..b078007 100644 --- a/tests/helper-pagexml_helper_test.py +++ b/tests/helper-pagexml_helper_test.py @@ -1,16 +1,118 @@ import unittest +from typing import List, Tuple +import pagexml.helper.pagexml_helper as helper +import pagexml.model.physical_document_model as pdm from pagexml.parser import parse_pagexml_file +def make_region(points: List[Tuple[int, int]], doc_id: str = 'doc') -> pdm.PageXMLTextRegion: + coords = pdm.Coords(points) + return pdm.PageXMLTextRegion(doc_id=doc_id, coords=coords) + + +class TestRegionType(unittest.TestCase): + + def test_point(self): + tr = make_region([(1, 1)]) + self.assertEqual(helper.RegionType.POINT, helper.get_region_type(tr)) + + def test_hline(self): + tr = make_region([(1, 1), (2, 1)]) + self.assertEqual(helper.RegionType.HLINE, helper.get_region_type(tr)) + + def test_vline(self): + tr = make_region([(1, 1), (1, 2)]) + self.assertEqual(helper.RegionType.VLINE, helper.get_region_type(tr)) + + def test_box(self): + tr = make_region([(1, 1), (2, 2)]) + self.assertEqual(helper.RegionType.BOX, helper.get_region_type(tr)) + + class TestPageXMLHelper(unittest.TestCase): def setUp(self) -> None: + no_coords = None + point_coords1 = pdm.Coords([(1, 1)]) + point_coords2 = pdm.Coords([(1, 1)]) + point_coords3 = pdm.Coords([(2, 2)]) + hline_coords1 = pdm.Coords([(0, 0), (10, 0)]) + hline_coords2 = pdm.Coords([(5, 0), (15, 0)]) + hline_coords3 = pdm.Coords([(0, 5), (10, 5)]) + vline_coords1 = pdm.Coords([(0, 0), (0, 10)]) + vline_coords2 = pdm.Coords([(0, 5), (0, 15)]) + vline_coords3 = pdm.Coords([(5, 0), (5, 10)]) + self.no_coords_region = pdm.PageXMLTextRegion(doc_id='no_coords') + self.point_coords_region1 = pdm.PageXMLTextRegion(doc_id='point_coords1', coords=point_coords1) self.page_file = 'data/example.xml' self.page_doc = parse_pagexml_file(self.page_file) - def test_something(self): - self.assertEqual(True, 1 == 1) + def test_element_overlap_no_coords(self): + tr1 = make_region([(1, 1)]) + tr2 = pdm.PageXMLTextRegion(doc_id='no_coords') + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_same_points(self): + tr1 = make_region([(1, 1)]) + tr2 = make_region([(1, 1)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_different_points(self): + tr1 = make_region([(1, 1)]) + tr2 = make_region([(1, 2)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_on_horizontal_line(self): + tr1 = make_region([(5, 1)]) + tr2 = make_region([(1, 1), (10, 1)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_on_vertical_line(self): + tr1 = make_region([(1, 5)]) + tr2 = make_region([(1, 1), (1, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_not_on_horizontal_line(self): + tr1 = make_region([(5, 2)]) + tr2 = make_region([(1, 1), (10, 1)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_not_on_vertical_line(self): + tr1 = make_region([(2, 5)]) + tr2 = make_region([(1, 1), (1, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_inside_box(self): + tr1 = make_region([(5, 5)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_outside_box(self): + tr1 = make_region([(5, 15)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_horizontal_line_through_box(self): + tr1 = make_region([(5, 5), (5, 15)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_horizontal_line_outside_box(self): + tr1 = make_region([(5, 15), (5, 20)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) if __name__ == '__main__': diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index 75fb037..75edb63 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -1,3 +1,4 @@ +import math import unittest from unittest.mock import Mock @@ -23,7 +24,7 @@ def test_point_string(self): def test_invalid_points(self): with self.assertRaises(ValueError): - coords = pdm.Coords('invalid points') + pdm.Coords('invalid points') class TestHullCoords(unittest.TestCase): @@ -73,6 +74,19 @@ def test_valid_points_from_str(self): self.assertEqual({'x': 795, 'y': 1109, 'w': 427, 'h': 138}, coords.box) +class TestHelperFunctions(unittest.TestCase): + + def test_poly_area_correctly_calculates_square_area(self): + side = 50 + square_points = [(0, 0), (0, side), (side, side), (side, 0)] + self.assertEqual(side**2, pdm.poly_area(square_points)) + + def test_poly_area_ignores_inner_points(self): + side = 50 + square_points = [(0, 0), (0, side), (side, side), (side, 0), (side/2, side/2)] + self.assertEqual(side**2, pdm.poly_area(square_points)) + + class TestStructureDoc(unittest.TestCase): def test_init(self): @@ -168,9 +182,11 @@ def test_add_parent_id_to_metadata(self): def test_json(self): doc = pdm.StructureDoc(doc_id='doc1', doc_type='book', metadata={'title': 'The Great Gatsby'}) + print('TEST_JSON - doc.main_type:', doc.main_type) json_data = doc.json self.assertEqual('doc1', json_data['id']) - self.assertEqual('book', json_data['type']) + self.assertIn('book', json_data['type']) + self.assertEqual('book', json_data['main_type']) self.assertEqual({'title': 'The Great Gatsby'}, json_data['metadata']) self.assertEqual({}, json_data.get('reading_order', {})) @@ -188,7 +204,7 @@ def setUp(self): def test_init(self): self.assertEqual('doc1', self.doc.id) - self.assertEqual(['physical_structure_doc', 'book'], self.doc.type) + self.assertEqual(['structure_doc', 'physical_structure_doc', 'book'], self.doc.type) self.assertEqual(self.metadata, self.doc.metadata) self.assertEqual(self.coords, self.doc.coords) @@ -209,7 +225,8 @@ def test_add_parent_id_to_metadata(self): def test_json(self): expected_json = { 'id': 'doc1', - 'type': ['physical_structure_doc', 'book'], + 'type': ['structure_doc', 'physical_structure_doc', 'book'], + 'main_type': 'book', 'domain': 'physical', 'metadata': {'author': 'Jane Doe'}, 'coords': [(0, 0), (0, 10), (10, 10), (10, 0)] @@ -217,6 +234,41 @@ def test_json(self): self.assertEqual(expected_json, self.doc.json) +class TestPhysicalDocArea(unittest.TestCase): + + def setUp(self) -> None: + points = [(0, 100), (300, 100), (300, 200), (0, 200), (150, 150)] + coords = pdm.Coords(points) + self.doc = pdm.PhysicalStructureDoc(doc_id='doc1', coords=coords) + + def test_doc_has_no_initial_area(self): + self.assertEqual(None, self.doc._area) + + def test_doc_has_area(self): + self.assertEqual(100*300, self.doc.area) + + def test_doc_area_sets_area(self): + area = self.doc.area + self.assertEqual(area, self.doc._area) + + def test_diamoned_shape_has_correct_area(self): + points = [(0, 100), (100, 0), (200, 100), (100, 200)] + coords = pdm.Coords(points) + diamond = pdm.PhysicalStructureDoc(doc_id='doc1', coords=coords) + side = math.sqrt(100**2 + 100**2) + area = side * side + self.assertEqual(area, diamond.area) + + +class TestEmptyRegion(unittest.TestCase): + + def test_create_empty_region(self): + points = [(0, 100), (300, 100), (300, 200), (0, 200), (150, 150)] + coords = pdm.Coords(points) + empty_region = pdm.EmptyRegionDoc(doc_id='empty', coords=coords) + self.assertEqual(300 * 100, empty_region.area) + + class TestLogicalStructureDoc(unittest.TestCase): def setUp(self): self.doc = pdm.LogicalStructureDoc(