@@ -514,25 +514,34 @@ def extract_cells(textpage, cell, markdown=False):
514514 Returns:
515515 A string with the text extracted from the cell.
516516 """
517+
def outside_cell(bbox, cell):
    """Report whether *bbox* shares no interior area with *cell*.

    Both arguments are indexed with 0..3. Indices 0 and 2 are compared
    against each other, as are indices 1 and 3 (presumably the usual
    (x0, y0, x1, y1) rectangle layout -- confirm against the caller).
    Rectangles that merely touch along an edge count as outside, because
    the comparisons are non-strict.

    Returns:
        A truthy value if the rectangles are disjoint or only touch,
        otherwise False.
    """
    # The checks run in this fixed order and stop at the first hit.
    if bbox[0] >= cell[2]:  # bbox starts at or beyond cell's far edge (axis 0/2)
        return True
    if bbox[2] <= cell[0]:  # bbox ends at or before cell's near edge (axis 0/2)
        return True
    if bbox[1] >= cell[3]:  # bbox starts at or beyond cell's far edge (axis 1/3)
        return True
    # Last possibility: bbox ends at or before cell's near edge (axis 1/3).
    return bbox[3] <= cell[1]
526+
517527 text = ""
518528 for block in textpage .extractRAWDICT ()["blocks" ]:
519529 if block ["type" ] != 0 :
520530 continue
531+ if outside_cell (block ["bbox" ], cell ):
532+ continue
521533 for line in block ["lines" ]:
522- new_line = True
534+ if outside_cell (line ["bbox" ], cell ):
535+ continue
523536 if text : # must be a new line in the cell
524- if text .endswith ("$" ):
525- text += " "
526- elif text .endswith ("$ " ):
527- pass
528- else :
529- text += "<br>" if markdown else "\n "
537+ text += "<br>" if markdown else "\n "
530538
531539 # strikeout detection only works with horizontal text
532540 horizontal = line ["dir" ] == (0 , 1 ) or line ["dir" ] == (1 , 0 )
533541
534542 for span in line ["spans" ]:
535- sbbox = span ["bbox" ]
543+ if outside_cell (span ["bbox" ], cell ):
544+ continue
536545 # only include chars with more than 50% bbox overlap
537546 span_text = ""
538547 for char in span ["chars" ]:
@@ -576,7 +585,7 @@ def extract_cells(textpage, cell, markdown=False):
576585 text += " "
577586 else :
578587 text += prefix + span_text + suffix
579-
588+ text = text . replace ( "$<br>" , "$ " ). replace ( " $ <br>" , "$ " )
580589 return text .strip ()
581590
582591
@@ -635,3 +644,19 @@ def table_to_markdown(textpage, table_item, markdown=True):
635644 line += "\n "
636645 output += line
637646 return output + "\n "
647+
648+
def table_extract(textpage, table_item):
    """Return the text of every table cell as a row-major grid.

    Args:
        textpage: text page object, handed through unchanged to
            extract_cells for each cell box.
        table_item: object whose ``table`` attribute is a mapping with the
            keys "row_count", "col_count" and "cells"; "cells" holds one
            sequence of cell boxes (or None entries) per row.

    Returns:
        A list of "row_count" lists with "col_count" entries each. An
        entry is the result of extract_cells(..., markdown=False) for the
        box at that position, or None where no box is given.
    """
    info = table_item.table
    n_rows = info["row_count"]
    n_cols = info["col_count"]
    boxes = info["cells"]

    # Start from an all-None grid so positions without a box stay None.
    grid = [[None] * n_cols for _ in range(n_rows)]

    for r, box_row in enumerate(boxes):
        for c, box in enumerate(box_row):
            if box is None:
                continue
            grid[r][c] = extract_cells(textpage, box, markdown=False)

    return grid
0 commit comments