Skip to content

Commit

Permalink
Ingestion improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
vkrd committed Aug 13, 2024
1 parent 75cebf0 commit 1bcdbd9
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions scripts/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,11 +674,14 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
page_number = bounding_box['pageNumber'] - 1 # Page numbers in PyMuPDF start from 0
x0, y0, x1, y1 = polygon_to_bbox(bounding_box['polygon'])

# Select the figure and upscale it by 200% for higher resolution
page = document.load_page(page_number)
bbox = fitz.Rect(x0, y0, x1, y1)

zoom = 2.0
# If either the width or height of the bounding box is less than 3 inches, we upscale by 2x
if bbox.width < 72*3 or bbox.height < 72*3:
zoom = 2.0
else:
zoom = 1.0
mat = fitz.Matrix(zoom, zoom)
image = page.get_pixmap(matrix=mat, clip=bbox)

Expand All @@ -687,17 +690,24 @@ def extract_pdf_content(file_path, form_recognizer_client, use_layout=False):
image_base64 = base64.b64encode(image_data).decode("utf-8")
image_base64 = f"data:image/jpg;base64,{image_base64}"

# Add the image tag to the full text
# Identify the text that corresponds to the figure
replace_start = figure["spans"][0]["offset"]
replace_end = figure["spans"][0]["offset"] + figure["spans"][0]["length"]

# Sometimes the figure doesn't correspond to any text, in which case we skip it
if replace_start == replace_end:
continue

# Now we get the image tag
original_text = form_recognizer_results.content[replace_start:replace_end]

if original_text not in full_text:
continue

img_tag = image_content_to_tag(original_text)

full_text = full_text.replace(original_text, img_tag)
# We replace only the first occurrence of the original text
full_text = full_text.replace(original_text, img_tag, 1)
image_mapping[img_tag] = image_base64

return full_text, image_mapping
Expand Down

0 comments on commit 1bcdbd9

Please sign in to comment.