#
# PRIVATE
#

def _determine_sorted_lines_end_points(
    self, lines_in_table: typing.List[LineSegment]
) -> typing.Tuple[typing.List[Decimal], typing.List[Decimal]]:
    """
    Collect the end-point coordinates of the given lines, per axis, in ascending
    order, collapsing coordinates that (nearly) coincide.

    A coordinate is only kept when it lies more than ``min_dist`` (1 unit) beyond
    the previously *kept* coordinate on the same axis. Comparing Decimal values
    with a tolerance (rather than truncating to ``int`` and using a set) keeps the
    precise position of each grid line while still merging near-duplicates.

    :param lines_in_table:  the line segments (each exposing x0, y0, x1, y1) making up a table
    :return:                a tuple (xs, ys) of sorted, de-duplicated x- and y-coordinates
    """
    # coordinates this close (or closer) to the last kept one are treated as duplicates
    min_dist: Decimal = Decimal(1)

    def _sorted_and_filtered(
        values: typing.List[Decimal],
    ) -> typing.List[Decimal]:
        kept: typing.List[Decimal] = []
        for v in sorted(values):
            # always compare against the last value that was kept,
            # not against the last value that was seen
            if not kept or v - kept[-1] > min_dist:
                kept.append(v)
        return kept

    # gather every end-point coordinate, per axis
    whole_xs: typing.List[Decimal] = [
        Decimal(x) for l in lines_in_table for x in (l.x0, l.x1)
    ]
    whole_ys: typing.List[Decimal] = [
        Decimal(y) for l in lines_in_table for y in (l.y0, l.y1)
    ]

    return _sorted_and_filtered(whole_xs), _sorted_and_filtered(whole_ys)


def _determine_number_of_rows_and_columns(
    self, lines_in_table: typing.List[LineSegment]
) -> typing.Tuple[int, int]:
    """
    Derive the number of rows and columns of the grid spanned by the given lines.

    N distinct y-coordinates delimit N - 1 rows, and M distinct x-coordinates
    delimit M - 1 columns. For an empty list of lines both counts are -1.

    :param lines_in_table:  the line segments making up a table
    :return:                a tuple (number_of_rows, number_of_cols)
    """
    # determine the end points of the lines
    xs: typing.List[Decimal]
    ys: typing.List[Decimal]
    xs, ys = self._determine_sorted_lines_end_points(lines_in_table)

    # rows are bounded by horizontal grid lines (distinct ys),
    # columns are bounded by vertical grid lines (distinct xs)
    number_of_rows: int = len(ys) - 1
    number_of_cols: int = len(xs) - 1

    # return
    return number_of_rows, number_of_cols
lines_in_table: typing.List[LineSegment] ) -> Table: - # keep track of unique xs / ys (to derive number of rows/cols) - unique_xs: typing.Set[int] = set() - unique_ys: typing.Set[int] = set() - - for l in lines_in_table: - unique_xs.add(int(l.x0)) - unique_xs.add(int(l.x1)) - unique_ys.add(int(l.y0)) - unique_ys.add(int(l.y1)) + # determine the end points of the lines + xs: typing.List[Decimal] + ys: typing.List[Decimal] + xs, ys = self._determine_sorted_lines_end_points(lines_in_table) # determine number of rows and cols - number_of_rows: int = len(unique_ys) - 1 - number_of_cols: int = len(unique_xs) - 1 - - # sort unique_xs and unique_ys - xs: typing.List[Decimal] = sorted([Decimal(x) for x in unique_xs]) - ys: typing.List[Decimal] = sorted([Decimal(y) for y in unique_ys]) + number_of_rows: int = len(ys) - 1 + number_of_cols: int = len(xs) - 1 # find neighbouring cells and join wherever appropriate ds: disjointset = disjointset() @@ -117,8 +131,8 @@ def _determine_table_cell_boundaries( for j in range(0, number_of_cols): ds.add((i, j)) - for c in range(0, len(xs) - 1): - for r in range(0, len(ys) - 1): + for c in range(0, number_of_cols): + for r in range(0, number_of_rows): if c + 2 < len(xs): logger.debug( "attempting to merge [%d %d] with its right neighbour" % (r, c) @@ -169,9 +183,7 @@ def _determine_table_cell_boundaries( # check whether all areas are rectangular for i in range(min_col, max_col): for j in range(min_row, max_row): - assert ( - j * number_of_rows + i - ) in v, "Non-rectangular area detected in table." + assert (i, j) in v, "Non-rectangular area detected in table." # create TableCell tc: TableCell = TableCell(