Skip to content

Commit

Permalink
Merge pull request #16 from knaw-huc/improve_doc_type
Browse files Browse the repository at this point in the history
Improve working with region areas, overlap and size
  • Loading branch information
brambg committed Mar 18, 2024
2 parents 091dce9 + ba9de4e commit 46f76c2
Show file tree
Hide file tree
Showing 9 changed files with 365 additions and 71 deletions.
72 changes: 49 additions & 23 deletions pagexml/analysis/layout_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def interpolate_baseline_points(points: List[Tuple[int, int]],


def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]],
step: int = 50):
step: int = 50) -> np.ndarray:
if points1 is None or points2 is None:
return np.array([])
b1_points = interpolate_baseline_points(points1, step=step)
Expand Down Expand Up @@ -158,12 +158,15 @@ def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.Pa
return distances


def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int:
def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
debug: int = 0) -> int:
"""Compute the average (mean) baseline height for comparing lines that
are not horizontally aligned.
:param line: a TextLine or a list of adjacent lines
:type line: PageXMLTextLine
:param debug: the detail level of debug information (0 = none, higher is more)
:type debug: int
:return: the average (mean) baseline height across all its baseline points
:rtype: int
"""
Expand All @@ -179,7 +182,8 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
# segment contributes its average height times its width
total_avg += segment_avg * abs(next_point[0] - curr_point[0])
if total_avg < 0:
print(f'total_avg: {total_avg}\n')
print(f'pagexml.analysis.layout_stats.average_baseline_height - '
f'negative total_avg {total_avg} for line {line.id}\n')

# average is total of average heights divided by total width
x = sorted([point[0] for point in points])
Expand All @@ -189,8 +193,9 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
else:
# this should not happen, but if it does, we need to calculate
# the average differently, to avoid a division by zero error
print(f"total_avg={total_avg}")
print(f"baseline.points[-1][0]={points[-1][0]}")
if debug > 0:
print(f"total_avg={total_avg}")
print(f"baseline.points[-1][0]={points[-1][0]}")
xcoords = [p[0] for p in points]
left_x = min(xcoords)
right_x = max(xcoords)
Expand All @@ -200,7 +205,7 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex
return int(total_avg)


def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, step: int = 50,
debug: int = 0) -> Tuple[List[Tuple[int, int]],
List[Tuple[int, int]]]:
"""Split the list of bounding polygon coordinates of a line in sets of points above and below
Expand All @@ -209,6 +214,8 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
:param line: a PageXML text line
:type line: PageXMLTextLine
:param step: number of pixels between interpolated points
:type step: int
:param debug: the detail level of debug information (0 = none, higher is more)
:type debug: int
:return: two lists of bounding polygon points
Expand All @@ -225,12 +232,14 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
return above_baseline, below_baseline
if line.coords.left > line.baseline.right:
return above_baseline, below_baseline
interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()]
interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=step).items()]
if debug > 2:
print('sort_coords_above_below_baseline - line.id:', line.id)
print('sort_coords_above_below_baseline - line.coords.points:', line.coords.points)
print('baseline_points:', line.baseline.points)
print('interpolated_baseline_points:', interpolated_baseline_points)
sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0])
if debug > 0:
if debug > 1:
print('sorted_coord_points:', sorted_coord_points)
print('len(sorted_coord_points):', len(sorted_coord_points))
if debug > 1:
Expand All @@ -240,7 +249,7 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
for ci_b, curr_b in enumerate(interpolated_baseline_points):
curr_bx, curr_by = curr_b
next_b = interpolated_baseline_points[ci_b + 1] if ci_b + 1 < num_baseline_points else None
if debug > 0:
if debug > 1:
print(f'sort_above_below - curr_b: {curr_b}')
print('\tci_c:', ci_c, '\tnum_coord_points:', num_coord_points)
if ci_c == num_coord_points:
Expand All @@ -249,24 +258,29 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
curr_cx, curr_cy = curr_c
if next_b and abs(next_b[0] - curr_cx) < abs(curr_b[0] - curr_cx):
break
if debug > 0:
if debug > 1:
print(f'sort_above_below - curr_c ({ci_c}): {curr_c}')
ci_c += 1
if curr_cy < curr_by:
if debug > 0:
if debug > 1:
print(f'sort_above_below - above')
above_baseline.append(curr_c)
else:
if debug > 0:
if debug > 1:
print(f'sort_above_below - below')
below_baseline.append(curr_c)

if debug > 2:
print('sort_coords_above_below_baseline - above_baseline:', above_baseline)
print('sort_coords_above_below_baseline - below_baseline:', below_baseline)
return above_baseline, below_baseline


def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
ignore_errors: bool = True, debug: int = 0) -> np.array:
above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug)
if line.baseline.width <= step:
step = 5
above_baseline, below_baseline = sort_coords_above_below_baseline(line, step=step, debug=debug)
if len(above_baseline) == 0:
if ignore_errors is False:
ValueError(f'line {line.id} has no bounding coordinates above baseline')
Expand All @@ -276,6 +290,10 @@ def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
ValueError(f'Warning: line {line.id} has no bounding coordinates below baseline')
int_base = interpolate_baseline_points(line.baseline.points, step=step)
int_above = interpolate_baseline_points(above_baseline, step=step)
if debug > 1:
print('get_text_heights - line.id:', line.id)
print('get_text_heights - int_base:', int_base)
print('get_text_heights - int_above:', int_above)

height = {}
for x in int_base:
Expand Down Expand Up @@ -327,7 +345,7 @@ def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]:
else:
distances = compute_bounding_box_distances(curr_line, next_line)
all_distances.append(distances)
return all_distances
return all_distances


def get_textregion_line_distances(text_region: pdm.PageXMLTextRegion) -> List[np.ndarray]:
Expand Down Expand Up @@ -593,7 +611,7 @@ def get_line_widths(pagexml_files: List[Union[str, pdm.PageXMLTextRegion]] = Non


def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = 50,
min_ratio: float = 0.25) -> List[int]:
min_ratio: float = 0.25, debug: int = 0) -> List[int]:
"""Find the minima in the distribution of line widths relative to the peaks in the distribution.
These minima represent the boundaries between clusters of lines within the same line width
intervals.
Expand All @@ -619,29 +637,37 @@ def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int =
curr_max_width = None
curr_min_width = None
prev_freq = 0
if debug > 0:
print(f"find_line_width_boundary_points - total_widths: {total_widths}")
print(f"find_line_width_boundary_points - max_width: {max_width}")
print(f"find_line_width_boundary_points - max_freq: {max_freq}")

for w in range(0, max_width + 1, line_bin_size):
f = width_freq[w]
if f > curr_max_freq:
# print(f'\tfreq {f} bigger than curr max: {curr_max_freq}')
if debug > 0:
print(f'\tfreq {f} bigger than curr max: {curr_max_freq}')
curr_max_freq = f
curr_max_width = w
if f < prev_freq and f < curr_min_freq:
# print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}')
if debug > 0:
print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}')
curr_min_freq = f
curr_min_width = w
if f / num_lines > 0.01 and f > prev_freq and f > curr_min_freq:
# print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}')
# if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
# print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq)
if debug > 0:
print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}')
# if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq)
if (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
boundary_points.append((curr_min_width, curr_min_freq))
curr_max_freq = 0
curr_max_width = 0
curr_min_freq = max_freq + 1
# print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}"
# f"\tcurr_min_freq: {curr_min_freq: >8}"
# f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}")
if debug > 0:
print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}"
f"\tcurr_min_freq: {curr_min_freq: >8}"
f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}")
prev_freq = f
return [bp[0] for bp in boundary_points]

Expand Down
22 changes: 16 additions & 6 deletions pagexml/analysis/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import pagexml.model.physical_document_model as pdm


# Element-count fields that get_doc_stats records for every document; a field
# that is absent from a document's own stats is recorded as 0.
DEFAULT_ELEMENTS = ['lines', 'words', 'text_regions', 'columns', 'extra', 'pages']


def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[int]:
    """Derive line-width boundary points by splitting the document width in five bins.

    The boundaries are the multiples of one fifth of the document width that
    lie strictly below the full width.

    NOTE: because the bin width is computed with true division, the returned
    values are numpy floats, not ints, despite the annotation.

    :param pagexml_doc: a PageXML document (or region) with coordinates
    :type pagexml_doc: PageXMLTextRegion
    :return: the boundary points, or an empty list if the document has no
        coordinates or a width of zero
    :rtype: List[int]
    """
    coords = pagexml_doc.coords
    # Without coordinates there is no width to divide, and a zero width would
    # result in a zero step, which is not a valid step for np.arange.
    if coords is None or coords.width == 0:
        return []
    bin_width = coords.width / 5
    return list(np.arange(bin_width, coords.width, bin_width))
Expand All @@ -17,11 +20,14 @@ def _init_doc_stats(line_width_boundary_points: List[int],
fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height',
'lines', 'words', 'text_regions',
'columns', 'extra', 'pages',
'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words',
'num_words', 'num_alpha_words', 'num_number_words',
'num_title_words', 'num_non_title_words',
'num_stop_words', 'num_punctuation_words', 'num_oversized_words']
doc_stats = {field: [] for field in fields}
for cat_wpl in text_stats.wpl_cat_range:
doc_stats[f"words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = []
for cat_wpl in text_stats.wpl_cat_range:
doc_stats[f"alpha_words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = []
for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size):
doc_stats[f"num_words_length_{length_bin}"] = []
for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points):
Expand Down Expand Up @@ -70,19 +76,23 @@ def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTex
for pi, pagexml_doc in enumerate(pagexml_docs):
pagexml_doc_stats['doc_id'].append(pagexml_doc.id)
pagexml_doc_stats['doc_num'].append(pi + 1)
pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width)
pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height)
pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width if pagexml_doc.coords else None)
pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height if pagexml_doc.coords else None)
lines = [line for line in pagexml_doc.get_lines() if line.text is not None]
words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries)
word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words,
max_word_length=max_word_length)
wpl_stats = text_stats.get_words_per_line(lines)
for field in pagexml_doc.stats:
pagexml_doc_stats[field].append(pagexml_doc.stats[field])
wpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=False)
awpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=True)
# for field in pagexml_doc.stats:
for field in DEFAULT_ELEMENTS:
pagexml_doc_stats[field].append(pagexml_doc.stats[field] if field in pagexml_doc.stats else 0)
for word_cat in word_stats:
pagexml_doc_stats[word_cat].append((word_stats[word_cat]))
for wpl_cat in text_stats.wpl_cat_range.values():
pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat])
for wpl_cat in text_stats.wpl_cat_range.values():
pagexml_doc_stats[f'alpha_words_per_line_{wpl_cat}'].append(awpl_stats[wpl_cat])
if line_width_boundary_points is None:
bin_width = pagexml_doc.coords.width / 5
line_width_boundary_points = [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)]
Expand Down
11 changes: 9 additions & 2 deletions pagexml/analysis/text_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,13 +927,16 @@ def get_typical_start_end_words(wbd: WordBreakDetector,
return typical_start_words, typical_end_words


def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False):
def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False,
alpha_words_only: bool = False):
"""Return a Counter of the number of words per line of a PageXML pagexml_doc object.
:param lines: a list of PageXMLTextLine objects
:type lines: List[PageXMLTextLine]
:param use_re_word_boundaries: whether to split words of a line using RegEx word boundaries
:type use_re_word_boundaries: bool
:param alpha_words_only: whether to only count words consisting of alpha characters (e.g. no numbers)
:type alpha_words_only: bool
:return: a counter of the number of words per line of a pagexml_doc
:rtype: Counter
"""
Expand All @@ -944,9 +947,12 @@ def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries:
if line.text is None or line.text == '':
words = []
elif use_re_word_boundaries:
words = [w.replace(' ', '') for w in re.split(r'\b', line.text) if w != ' ' and w != '']
words = [w.replace(' ', '') for w in re.split(r'\b', line.text)]
else:
words = [w for w in line.text.split(' ')]
words = [w for w in words if w != ' ' and w != '']
if alpha_words_only is True:
words = [w for w in words if w.isalpha()]
# words_per_line.update([len(words)])
if len(words) in wpl_to_cat:
wpl_cat = wpl_to_cat[len(words)]
Expand Down Expand Up @@ -991,6 +997,7 @@ def get_word_cat_stats(words, stop_words=None, max_word_length: int = 30,
word_length_freq = Counter([len(w) for w in words if len(w) <= max_word_length])
word_cat_stats = {
'num_words': len(words),
'num_alpha_words': len([w for w in words if w.isalpha()]),
'num_number_words': len([w for w in words if w.isdigit()]),
'num_title_words': len([w for w in words if w.istitle()]),
'num_non_title_words': len([w for w in words if w.istitle() is False]),
Expand Down
2 changes: 1 addition & 1 deletion pagexml/column_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def column_bounding_box_surrounds_lines(column: pdm.PageXMLColumn) -> bool:
"""Check if the column coordinates contain the coordinate
boxes of the column lines."""
for line in column.get_lines():
if not pagexml_helper.elements_overlap(column, line, threshold=0.6):
if not pagexml_helper.regions_overlap(column, line, threshold=0.6):
return False
return True

Expand Down
68 changes: 56 additions & 12 deletions pagexml/helper/pagexml_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import string
from collections import Counter
from enum import Enum
from typing import Dict, Generator, List, Set, Tuple, Union

import numpy as np
Expand All @@ -13,22 +14,65 @@
import pagexml.model.physical_document_model as pdm


def elements_overlap(element1: pdm.PageXMLDoc, element2: pdm.PageXMLDoc,
threshold: float = 0.5) -> bool:
"""Check if two elements have overlapping coordinates."""
v_overlap = pdm.get_vertical_overlap(element1, element2)
h_overlap = pdm.get_horizontal_overlap(element1, element2)
if v_overlap / element1.coords.height > threshold:
if h_overlap / element1.coords.width > threshold:
return True
if v_overlap / element2.coords.height > threshold:
if h_overlap / element2.coords.width > threshold:
return True
def is_point_inside(point: Tuple[int, int], element: pdm.PageXMLDoc) -> bool:
    """Check if a point lies within the bounding box of an element.

    Points on the edge of the bounding box count as inside.

    :param point: an (x, y) coordinate pair
    :type point: Tuple[int, int]
    :param element: the element whose bounding box is tested
    :type element: PageXMLDoc
    :return: True if the point is inside or on the edge of the bounding box,
        False otherwise or if the element has no coordinates
    :rtype: bool
    """
    coords = element.coords
    # An element without coordinates cannot contain anything (same convention
    # as regions_overlap, which treats missing coordinates as "no overlap").
    if coords is None:
        return False
    x, y = point
    return coords.left <= x <= coords.right and coords.top <= y <= coords.bottom


class RegionType(Enum):
    """The geometric shape of a region, based on the size of its bounding box.

    See get_region_type for how an element is mapped to one of these values.
    """

    # zero height and zero width
    POINT = 1
    # zero height, non-zero width
    HLINE = 2
    # zero width, non-zero height
    VLINE = 3
    # non-zero height and width
    BOX = 4


def get_region_type(element: pdm.PageXMLDoc) -> RegionType:
    """Classify an element by the shape of its bounding box.

    :param element: an element with coordinates
    :type element: PageXMLDoc
    :return: POINT if both height and width are zero, HLINE if only the height
        is zero, VLINE if only the width is zero, and BOX otherwise
    :rtype: RegionType
    """
    zero_height = element.coords.height == 0
    zero_width = element.coords.width == 0
    if zero_height and zero_width:
        return RegionType.POINT
    if zero_height:
        return RegionType.HLINE
    if zero_width:
        return RegionType.VLINE
    return RegionType.BOX


def same_point(point1: Tuple[int, int], point2: Tuple[int, int]) -> bool:
    """Check if two points are the same.

    Only the first two elements (x and y) of each point are compared.

    :param point1: an (x, y) coordinate pair
    :type point1: Tuple[int, int]
    :param point2: an (x, y) coordinate pair
    :type point2: Tuple[int, int]
    :return: True if both points have the same x and the same y
    :rtype: bool
    """
    return point1[0] == point2[0] and point1[1] == point2[1]


def regions_overlap(region1: pdm.PageXMLDoc, region2: pdm.PageXMLDoc,
                    threshold: float = 0.5) -> bool:
    """Check if two regions have overlapping coordinates.

    Two regions overlap if, for at least one of them, both the vertical and
    the horizontal overlap exceed the given fraction of that region's own
    height and width.

    Assumption: points are pixels, so regions with at least one point have at least
    a width, height and area of 1.

    :param region1: the first region
    :type region1: PageXMLDoc
    :param region2: the second region
    :type region2: PageXMLDoc
    :param threshold: the fraction of a region's height and width that the
        overlap must exceed
    :type threshold: float
    :return: True if the regions overlap by more than the threshold, False
        otherwise or if either region has no coordinates
    :rtype: bool
    """
    if region1.coords is None or region2.coords is None:
        return False

    v_overlap = pdm.get_vertical_overlap(region1, region2)
    h_overlap = pdm.get_horizontal_overlap(region1, region2)

    # The overlap is measured relative to each region in turn, so a small
    # region that is largely covered by a big one still counts as overlapping.
    for region in (region1, region2):
        # +1 per the pixel assumption above; it also avoids a division by
        # zero for regions that are a point or a line.
        height = region.coords.height + 1
        width = region.coords.width + 1
        if v_overlap / height > threshold and h_overlap / width > threshold:
            return True
    return False


def sort_regions_in_reading_order(doc: pdm.PageXMLDoc) -> List[pdm.PageXMLTextRegion]:
"""Sort text regions in reading order. If an explicit reading order is given,
Expand Down
Loading

0 comments on commit 46f76c2

Please sign in to comment.