def extract_images_from_pdf(pdf_doc, s3_path: str) -> str:
    """Extract every embedded image of a PDF and save each one as a PNG file.

    Images are written to ``extracted_images/images_from_<filename>/``
    (relative to the current working directory), where ``<filename>`` is the
    last ``/``-separated component of *s3_path*. Each image is stored as
    ``page<page index>-<xref>.png``.

    Args:
        pdf_doc: An opened PyMuPDF (``fitz``) document.
        s3_path: S3 key of the PDF; only its final path component is used,
            so a key without any ``/`` is accepted as well.

    Returns:
        ``"Success!"`` when extraction completes, otherwise a message starting
        with ``"Error extracting images from pdf: "`` followed by the text of
        the exception that interrupted the extraction.
    """
    print("Extracting images from pdf...")

    try:
        filename = s3_path.split("/")[-1]
        image_dir = "extracted_images"
        folder_name = "images_from_" + filename
        print("folder name: ", folder_name)
        folder_path = os.path.join(image_dir, folder_name)

        if not os.path.exists(image_dir):
            print("Creating directory for extracted images...")
        if not os.path.exists(folder_path):
            print("Creating directory for extracted images from this pdf...")
        # exist_ok avoids a check-then-create race and builds both levels at once.
        os.makedirs(folder_path, exist_ok=True)

        for page_index in range(len(pdf_doc)):
            for img in pdf_doc.get_page_images(page_index):
                xref = img[0]
                pix = fitz.Pixmap(pdf_doc, xref)
                # PNG can only hold gray or RGB data. Anything with four or
                # more colour components (alpha excluded), e.g. CMYK, must be
                # converted to RGB before it can be saved.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(os.path.join(folder_path, f"page{page_index}-{xref}.png"))
                pix = None  # release the pixmap's memory before the next image
        return "Success!"
    except Exception as e:
        # Best-effort step: report the failure to the caller instead of
        # aborting the surrounding ingest.
        return "Error extracting images from pdf: " + str(e)