Skip to content

Commit

Permalink
created function for pdf extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 21, 2023
1 parent 48e968f commit 82b0fa4
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 10 deletions.
44 changes: 44 additions & 0 deletions ai_ta_backend/image_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import fitz
from docx import Document
from pptx import Presentation
from tempfile import NamedTemporaryFile

def extract_images_from_pdf(pdf_doc, s3_path):
    """
    Extracts the embedded images of a pdf file and stores them as PNG files.

    Images are written to ``extracted_images/images_from_<filename>/``
    (relative to the current working directory), where ``<filename>`` is the
    last "/"-separated component of ``s3_path``. Each file is named
    ``page<page index>-<xref>.png``; the page index is zero-based.

    Args:
        pdf_doc: An open PyMuPDF document (e.g. the result of ``fitz.open``).
        s3_path: Path of the pdf; only its last component is used, to name
            the output folder.

    Returns:
        "Success!" when all images were written, otherwise a string starting
        with "Error extracting images from pdf: " followed by the error
        message. Errors are reported through the return value on purpose so
        that a failed image extraction does not abort the caller.
    """
    print("Extracting images from pdf...")

    try:
        filename = s3_path.split("/")[-1]
        image_dir = "extracted_images"
        folder_name = "images_from_" + filename
        print("folder name: ", folder_name)
        folder_path = os.path.join(image_dir, folder_name)

        if not os.path.exists(image_dir):
            print("Creating directory for extracted images...")
        if not os.path.exists(folder_path):
            print("Creating directory for extracted images from this pdf...")
        # exist_ok avoids failing when the directory appears between the
        # existence check above and its creation; parents are created too.
        os.makedirs(folder_path, exist_ok=True)

        # Index-based iteration: get_page_images() takes a page number, so
        # the page objects themselves never need to be loaded.
        for page_index in range(len(pdf_doc)):
            for img in pdf_doc.get_page_images(page_index):
                xref = img[0]
                target = os.path.join(folder_path, "page%s-%s.png" % (page_index, xref))
                pix = fitz.Pixmap(pdf_doc, xref)
                # PNG only supports gray and RGB. Count the colour components
                # without the alpha channel so that CMYK images are converted
                # whether or not they carry alpha.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(target)
                pix = None  # release the pixmap memory before the next image
        return "Success!"
    except Exception as e:
        return "Error extracting images from pdf: " + str(e)
15 changes: 5 additions & 10 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor
from ai_ta_backend.utils_tokenization import count_tokens_and_cost

from ai_ta_backend.image_extraction import extract_images_from_pdf

class Ingest():
"""
Expand Down Expand Up @@ -478,28 +479,22 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
### READ OCR of PDF
doc = fitz.open(pdf_tmpfile.name) # type: ignore

# extract all images from the pdf
result = extract_images_from_pdf(doc, s3_path)
print("Image extraction result:", result)

# improve quality of the image
zoom_x = 2.0 # horizontal zoom
zoom_y = 2.0 # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

pdf_pages_OCRed: List[Dict] = []
for i, page in enumerate(doc): # type: ignore
# image extraction
print("extracting image")
d = page.get_text("dict")
blocks = d['blocks']
img_blocks = [b for b in blocks if b['type']==1]
print(img_blocks[0])
exit()


# UPLOAD FIRST PAGE IMAGE to S3
if i == 0:
with NamedTemporaryFile(suffix=".png") as first_page_png:
pix = page.get_pixmap(matrix=mat)
pix.save(first_page_png) # store image as a PNG

s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png"
first_page_png.seek(0) # Seek the file pointer back to the beginning
with open(first_page_png.name, 'rb') as f:
Expand Down

0 comments on commit 82b0fa4

Please sign in to comment.