Merge pull request #16 from knaw-huc/improve_doc_type

Improve working with region areas, overlap and size
knaw-huc · Mar 18, 2024 · 46f76c2 · 46f76c2
2 parents 091dce9 + ba9de4e
commit 46f76c2
Show file tree

Hide file tree

Showing 9 changed files with 365 additions and 71 deletions.
diff --git a/pagexml/analysis/layout_stats.py b/pagexml/analysis/layout_stats.py
@@ -96,7 +96,7 @@ def interpolate_baseline_points(points: List[Tuple[int, int]],
 
 
 def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]],
-                             step: int = 50):
+                             step: int = 50) -> np.ndarray:
     if points1 is None or points2 is None:
         return np.array([])
     b1_points = interpolate_baseline_points(points1, step=step)
@@ -158,12 +158,15 @@ def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.Pa
     return distances
 
 
-def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int:
+def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
+                            debug: int = 0) -> int:
     """Compute the average (mean) baseline height for comparing lines that
     are not horizontally aligned.
 
     :param line: a TextLine or a list of adjacent lines
     :type line: PageXMLTextLine
+    :param debug: the detail level of debug information (0 = none, higher is more)
+    :type debug: int
     :return: the average (mean) baseline height across all its baseline points
     :rtype: int
     """
@@ -179,7 +182,8 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
         # segment contributes its average height times its width
         total_avg += segment_avg * abs(next_point[0] - curr_point[0])
     if total_avg < 0:
-        print(f'total_avg: {total_avg}\n')
+        print(f'pagexml.analysis.layout_stats.average_baseline_height - '
+              f'negative total_avg {total_avg} for line {line.id}\n')
 
     # average is total of average heights divided by total width
     x = sorted([point[0] for point in points])
@@ -189,8 +193,9 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
     else:
         # this should not happen, but if it does, we need to calculate
         # the average differently, to avoid a division by zero error
-        print(f"total_avg={total_avg}")
-        print(f"baseline.points[-1][0]={points[-1][0]}")
+        if debug > 0:
+            print(f"total_avg={total_avg}")
+            print(f"baseline.points[-1][0]={points[-1][0]}")
         xcoords = [p[0] for p in points]
         left_x = min(xcoords)
         right_x = max(xcoords)
@@ -200,7 +205,7 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
             return int(total_avg)
 
 
-def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
+def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, step: int = 50,
                                      debug: int = 0) -> Tuple[List[Tuple[int, int]],
                                                               List[Tuple[int, int]]]:
     """Split the list of bounding polygon coordinates of a line in sets of points above and below
@@ -209,6 +214,8 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
 
     :param line: a PageXML text line
     :type line: PageXMLTextLine
+    :param step: number of pixels between interpolated points
+    :type step: int
     :param debug: the detail level of debug information (0 = none, higher is more)
     :type debug: int
     :return: two lists of bounding polygon points
@@ -225,12 +232,14 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
         return above_baseline, below_baseline
     if line.coords.left > line.baseline.right:
         return above_baseline, below_baseline
-    interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()]
+    interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=step).items()]
     if debug > 2:
+        print('sort_coords_above_below_baseline - line.id:', line.id)
+        print('sort_coords_above_below_baseline - line.coords.points:', line.coords.points)
         print('baseline_points:', line.baseline.points)
         print('interpolated_baseline_points:', interpolated_baseline_points)
     sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0])
-    if debug > 0:
+    if debug > 1:
         print('sorted_coord_points:', sorted_coord_points)
         print('len(sorted_coord_points):', len(sorted_coord_points))
     if debug > 1:
@@ -240,7 +249,7 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
     for ci_b, curr_b in enumerate(interpolated_baseline_points):
         curr_bx, curr_by = curr_b
         next_b = interpolated_baseline_points[ci_b + 1] if ci_b + 1 < num_baseline_points else None
-        if debug > 0:
+        if debug > 1:
             print(f'sort_above_below - curr_b: {curr_b}')
             print('\tci_c:', ci_c, '\tnum_coord_points:', num_coord_points)
         if ci_c == num_coord_points:
@@ -249,24 +258,29 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
             curr_cx, curr_cy = curr_c
             if next_b and abs(next_b[0] - curr_cx) < abs(curr_b[0] - curr_cx):
                 break
-            if debug > 0:
+            if debug > 1:
                 print(f'sort_above_below - curr_c ({ci_c}): {curr_c}')
             ci_c += 1
             if curr_cy < curr_by:
-                if debug > 0:
+                if debug > 1:
                     print(f'sort_above_below - above')
                 above_baseline.append(curr_c)
             else:
-                if debug > 0:
+                if debug > 1:
                     print(f'sort_above_below - below')
                 below_baseline.append(curr_c)
 
+    if debug > 2:
+        print('sort_coords_above_below_baseline - above_baseline:', above_baseline)
+        print('sort_coords_above_below_baseline - below_baseline:', below_baseline)
     return above_baseline, below_baseline
 
 
 def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
                      ignore_errors: bool = True, debug: int = 0) -> np.array:
-    above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug)
+    if line.baseline.width <= step:
+        step = 5
+    above_baseline, below_baseline = sort_coords_above_below_baseline(line, step=step, debug=debug)
     if len(above_baseline) == 0:
         if ignore_errors is False:
             ValueError(f'line {line.id} has no bounding coordinates above baseline')
@@ -276,6 +290,10 @@ def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
             ValueError(f'Warning: line {line.id} has no bounding coordinates below baseline')
     int_base = interpolate_baseline_points(line.baseline.points, step=step)
     int_above = interpolate_baseline_points(above_baseline, step=step)
+    if debug > 1:
+        print('get_text_heights - line.id:', line.id)
+        print('get_text_heights - int_base:', int_base)
+        print('get_text_heights - int_above:', int_above)
 
     height = {}
     for x in int_base:
@@ -327,7 +345,7 @@ def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]:
             else:
                 distances = compute_bounding_box_distances(curr_line, next_line)
             all_distances.append(distances)
-        return all_distances
+    return all_distances
 
 
 def get_textregion_line_distances(text_region: pdm.PageXMLTextRegion) -> List[np.ndarray]:
@@ -593,7 +611,7 @@ def get_line_widths(pagexml_files: List[Union[str, pdm.PageXMLTextRegion]] = Non
 
 
 def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = 50,
-                                    min_ratio: float = 0.25) -> List[int]:
+                                    min_ratio: float = 0.25, debug: int = 0) -> List[int]:
     """Find the minima in the distribution of line widths relative to the peaks in the distribution.
     These minima represent the boundaries between clusters of lines within the same line width
     intervals.
@@ -619,29 +637,37 @@ def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int =
     curr_max_width = None
     curr_min_width = None
     prev_freq = 0
+    if debug > 0:
+        print(f"find_line_width_boundary_points - total_widths: {total_widths}")
+        print(f"find_line_width_boundary_points - max_width: {max_width}")
+        print(f"find_line_width_boundary_points - max_freq: {max_freq}")
 
     for w in range(0, max_width + 1, line_bin_size):
         f = width_freq[w]
         if f > curr_max_freq:
-            # print(f'\tfreq {f} bigger than curr max: {curr_max_freq}')
+            if debug > 0:
+                print(f'\tfreq {f} bigger than curr max: {curr_max_freq}')
             curr_max_freq = f
             curr_max_width = w
         if f < prev_freq and f < curr_min_freq:
-            # print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}')
+            if debug > 0:
+                print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}')
             curr_min_freq = f
             curr_min_width = w
         if f / num_lines > 0.01 and f > prev_freq and f > curr_min_freq:
-            # print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}')
-            # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
-            # print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq)
+            if debug > 0:
+                print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}')
+                # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
+                print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq)
             if (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
                 boundary_points.append((curr_min_width, curr_min_freq))
                 curr_max_freq = 0
                 curr_max_width = 0
                 curr_min_freq = max_freq + 1
-        # print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}"
-        #       f"\tcurr_min_freq: {curr_min_freq: >8}"
-        #       f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}")
+        if debug > 0:
+            print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}"
+                  f"\tcurr_min_freq: {curr_min_freq: >8}"
+                  f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}")
         prev_freq = f
     return [bp[0] for bp in boundary_points]
 

diff --git a/pagexml/analysis/stats.py b/pagexml/analysis/stats.py
@@ -7,6 +7,9 @@
 import pagexml.model.physical_document_model as pdm
 
 
+DEFAULT_ELEMENTS = ['lines', 'words', 'text_regions', 'columns', 'extra', 'pages']
+
+
 def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[int]:
     bin_width = pagexml_doc.coords.width / 5
     return [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)]
@@ -17,11 +20,14 @@ def _init_doc_stats(line_width_boundary_points: List[int],
     fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height',
               'lines', 'words', 'text_regions',
               'columns', 'extra', 'pages',
-              'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words',
+              'num_words', 'num_alpha_words', 'num_number_words',
+              'num_title_words', 'num_non_title_words',
               'num_stop_words', 'num_punctuation_words', 'num_oversized_words']
     doc_stats = {field: [] for field in fields}
     for cat_wpl in text_stats.wpl_cat_range:
         doc_stats[f"words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = []
+    for cat_wpl in text_stats.wpl_cat_range:
+        doc_stats[f"alpha_words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = []
     for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size):
         doc_stats[f"num_words_length_{length_bin}"] = []
     for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points):
@@ -70,19 +76,23 @@ def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTex
     for pi, pagexml_doc in enumerate(pagexml_docs):
         pagexml_doc_stats['doc_id'].append(pagexml_doc.id)
         pagexml_doc_stats['doc_num'].append(pi + 1)
-        pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width)
-        pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height)
+        pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width if pagexml_doc.coords else None)
+        pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height if pagexml_doc.coords else None)
         lines = [line for line in pagexml_doc.get_lines() if line.text is not None]
         words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries)
         word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words,
                                                    max_word_length=max_word_length)
-        wpl_stats = text_stats.get_words_per_line(lines)
-        for field in pagexml_doc.stats:
-            pagexml_doc_stats[field].append(pagexml_doc.stats[field])
+        wpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=False)
+        awpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=True)
+        # for field in pagexml_doc.stats:
+        for field in DEFAULT_ELEMENTS:
+            pagexml_doc_stats[field].append(pagexml_doc.stats[field] if field in pagexml_doc.stats else 0)
         for word_cat in word_stats:
             pagexml_doc_stats[word_cat].append((word_stats[word_cat]))
         for wpl_cat in text_stats.wpl_cat_range.values():
             pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat])
+        for wpl_cat in text_stats.wpl_cat_range.values():
+            pagexml_doc_stats[f'alpha_words_per_line_{wpl_cat}'].append(awpl_stats[wpl_cat])
         if line_width_boundary_points is None:
             bin_width = pagexml_doc.coords.width / 5
             line_width_boundary_points = [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)]

diff --git a/pagexml/analysis/text_stats.py b/pagexml/analysis/text_stats.py
@@ -927,13 +927,16 @@ def get_typical_start_end_words(wbd: WordBreakDetector,
     return typical_start_words, typical_end_words
 
 
-def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False):
+def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False,
+                       alpha_words_only: bool = False):
     """Return a Counter of the number of words per line of a PageXML pagexml_doc object.
 
     :param lines: a list of PageXMLTextLine objects
     :type lines: List[PageXMLTextLine]
     :param use_re_word_boundaries: whether to split words of a line using RegEx word boundaries
     :type use_re_word_boundaries: bool
+    :param alpha_words_only: whether to only count words consisting of alpha characters (e.g. no numbers)
+    :type alpha_words_only: bool
     :return: a counter of the number of words per line of a pagexml_doc
     :rtype: Counter
     """
@@ -944,9 +947,12 @@ def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries:
         if line.text is None or line.text == '':
             words = []
         elif use_re_word_boundaries:
-            words = [w.replace(' ', '') for w in re.split(r'\b', line.text) if w != ' ' and w != '']
+            words = [w.replace(' ', '') for w in re.split(r'\b', line.text)]
         else:
             words = [w for w in line.text.split(' ')]
+        words = [w for w in words if w != ' ' and w != '']
+        if alpha_words_only is True:
+            words = [w for w in words if w.isalpha()]
         # words_per_line.update([len(words)])
         if len(words) in wpl_to_cat:
             wpl_cat = wpl_to_cat[len(words)]
@@ -991,6 +997,7 @@ def get_word_cat_stats(words, stop_words=None, max_word_length: int = 30,
     word_length_freq = Counter([len(w) for w in words if len(w) <= max_word_length])
     word_cat_stats = {
         'num_words': len(words),
+        'num_alpha_words': len([w for w in words if w.isalpha()]),
         'num_number_words': len([w for w in words if w.isdigit()]),
         'num_title_words': len([w for w in words if w.istitle()]),
         'num_non_title_words': len([w for w in words if w.istitle() is False]),

diff --git a/pagexml/column_parser.py b/pagexml/column_parser.py
@@ -79,7 +79,7 @@ def column_bounding_box_surrounds_lines(column: pdm.PageXMLColumn) -> bool:
     """Check if the column coordinates contain the coordinate
     boxes of the column lines."""
     for line in column.get_lines():
-        if not pagexml_helper.elements_overlap(column, line, threshold=0.6):
+        if not pagexml_helper.regions_overlap(column, line, threshold=0.6):
             return False
     return True
 

diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py
@@ -3,6 +3,7 @@
 import re
 import string
 from collections import Counter
+from enum import Enum
 from typing import Dict, Generator, List, Set, Tuple, Union
 
 import numpy as np
@@ -13,22 +14,65 @@
 import pagexml.model.physical_document_model as pdm
 
 
-def elements_overlap(element1: pdm.PageXMLDoc, element2: pdm.PageXMLDoc,
-                     threshold: float = 0.5) -> bool:
-    """Check if two elements have overlapping coordinates."""
-    v_overlap = pdm.get_vertical_overlap(element1, element2)
-    h_overlap = pdm.get_horizontal_overlap(element1, element2)
-    if v_overlap / element1.coords.height > threshold:
-        if h_overlap / element1.coords.width > threshold:
-            return True
-    if v_overlap / element2.coords.height > threshold:
-        if h_overlap / element2.coords.width > threshold:
-            return True
+def is_point_inside(point: Tuple[int, int], element: pdm.PageXMLDoc) -> bool:
+    """Check whether a point lies within the left/right/top/bottom extent
+    of an element's coords.
+
+    The comparison is inclusive: a point that is exactly on the left, right,
+    top or bottom edge counts as inside.
+
+    :param point: an (x, y) point
+    :type point: Tuple[int, int]
+    :param element: the element whose coords are used as boundaries
+    :type element: PageXMLDoc
+    :return: True if the point is within or on the edges of the element's coords
+    :rtype: bool
+    """
+    # NOTE(review): element.coords is dereferenced without a None check
+    x, y = point
+    if x < element.coords.left or x > element.coords.right:
+        return False
+    if y < element.coords.top or y > element.coords.bottom:
+        return False
+    return True
+
+
+class RegionType(Enum):
+    """Shape category of a region, based on whether its coords have zero
+    height and/or zero width (see get_region_type)."""
+
+    # zero height and zero width
+    POINT = 1
+    # zero height, non-zero width
+    HLINE = 2
+    # zero width, non-zero height
+    VLINE = 3
+    # non-zero height and non-zero width
+    BOX = 4
+
+
+def get_region_type(element: pdm.PageXMLDoc) -> RegionType:
+    """Classify an element by the height and width of its coords.
+
+    :param element: an element with coords
+    :type element: PageXMLDoc
+    :return: POINT if height and width are both zero, HLINE if only the height
+        is zero, VLINE if only the width is zero, and BOX otherwise
+    :rtype: RegionType
+    """
+    # NOTE(review): element.coords is dereferenced without a None check
+    if element.coords.height == 0:
+        if element.coords.width == 0:
+            return RegionType.POINT
         else:
-            return False
+            return RegionType.HLINE
+    elif element.coords.width == 0:
+        return RegionType.VLINE
     else:
+        return RegionType.BOX
+
+
+def same_point(point1: Tuple[int, int], point2: Tuple[int, int]) -> bool:
+    """Check if two points are the same.
+
+    Only the first two items (x and y) of each point are compared.
+
+    :param point1: the first (x, y) point
+    :type point1: Tuple[int, int]
+    :param point2: the second (x, y) point
+    :type point2: Tuple[int, int]
+    :return: True if both points have equal x and equal y values
+    :rtype: bool
+    """
+    return point1[0] == point2[0] and point1[1] == point2[1]
+
+
+def regions_overlap(region1: pdm.PageXMLDoc, region2: pdm.PageXMLDoc,
+                    threshold: float = 0.5) -> bool:
+    """Check if two regions have overlapping coordinates.
+
+    The regions are considered overlapping if, for at least one of the two
+    regions, both the vertical overlap divided by its height and the horizontal
+    overlap divided by its width are greater than the threshold.
+
+    Assumption: points are pixels, so regions with at least one point have at least
+    a width, height and area of 1.
+
+    :param region1: the first region
+    :type region1: PageXMLDoc
+    :param region2: the second region
+    :type region2: PageXMLDoc
+    :param threshold: the ratio that both the vertical and horizontal overlap must exceed
+    :type threshold: float
+    :return: True if the overlap ratios of at least one region exceed the threshold,
+        False otherwise or when either region has no coords
+    :rtype: bool
+    """
+    if region1.coords is None or region2.coords is None:
         return False
 
+    # add 1 to each dimension (see the pixel assumption above), so regions with
+    # zero height or width do not cause a division by zero below
+    height1 = region1.coords.height + 1
+    width1 = region1.coords.width + 1
+    height2 = region2.coords.height + 1
+    width2 = region2.coords.width + 1
+
+    # NOTE(review): presumably these overlaps are counted with the same
+    # inclusive pixel convention as the + 1 above - TODO confirm in pdm
+    v_overlap = pdm.get_vertical_overlap(region1, region2)
+    h_overlap = pdm.get_horizontal_overlap(region1, region2)
+
+    # overlap relative to the size of region1
+    if v_overlap / height1 > threshold:
+        if h_overlap / width1 > threshold:
+            return True
+    # overlap relative to the size of region2
+    if v_overlap / height2 > threshold:
+        if h_overlap / width2 > threshold:
+            return True
+    return False
+
 
 def sort_regions_in_reading_order(doc: pdm.PageXMLDoc) -> List[pdm.PageXMLTextRegion]:
     """Sort text regions in reading order. If an explicit reading order is given,