-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_util.py
84 lines (72 loc) · 2.43 KB
/
pdf_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import fitz # PyMuPDF for PDF handling
from PIL import Image
import io
from dotenv import load_dotenv
import file_search
# 1. Search for the text inside the PDF
# 2. Highlight the matches and save the result as another PDF
# 3. Convert the output PDF into images
def pdf_search_annotate(pdf_file, search_text):
    """Highlight ``search_text`` in a PDF and return the result as page images.

    Runs ``search_annotate`` on ``pdf_file``, then renders the file
    ``highlighted_output.pdf`` with ``pdf_to_image_task`` and returns
    whatever that call returns.
    """
    search_annotate(pdf_file, search_text)
    # Render the annotated copy that the step above is expected to have saved.
    return pdf_to_image_task("highlighted_output.pdf")
# pdf upload task and convert to image
def pdf_to_image_task(pdf_file):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_file: The PDF to open; passed straight to ``fitz.open``.

    Returns:
        A list with one ``PIL.Image`` per page, in page order.

    Raises:
        ValueError: If ``pdf_file`` is falsy (``None``, empty string, ...).
    """
    if not pdf_file:
        raise ValueError("No pdf file provided , please upload the same")

    images = []
    # The context manager closes the document even when rendering fails;
    # previously the handle was never released.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            # The PNG bytes live in memory, so the images stay usable
            # after the document has been closed.
            png_bytes = page.get_pixmap().tobytes("png")
            images.append(Image.open(io.BytesIO(png_bytes)))
    return images
# search & annotate pdf
def search_annotate(pdf_file, search_text, output_file="highlighted_output.pdf"):
    """Highlight every occurrence of ``search_text`` and save an annotated copy.

    Args:
        pdf_file: The PDF to open; passed straight to ``fitz.open``.
        search_text: Text handed to ``page.search_for`` on each page.
        output_file: Where the annotated PDF is written. Defaults to
            ``"highlighted_output.pdf"``, the path used before this
            parameter existed.

    Returns:
        The document object, already closed (kept for backward
        compatibility; the annotated result is the saved file).
    """
    # The context manager closes the document even if searching,
    # annotating or saving raises.
    with fitz.open(pdf_file) as doc:
        print(f"search text {search_text}")
        for page in doc:
            for inst in page.search_for(search_text):
                print("found instance of search_text")
                page.add_highlight_annot(inst)
        # Save the document with highlights
        doc.save(output_file)
    return doc
# Compare resumes
# 1. Upload both resumes to the vector store
# 2. Run the query
# 3. Get the output
def compare_resume(resume1, resume2, search_query):
    """Add two resumes to the vector store named by the environment.

    Calls ``load_dotenv``, reads the ``resume_compare`` environment
    variable as the vector store name, and passes both resumes to
    ``file_search.add_files``. ``search_query`` is accepted but not used
    by the current body. Always returns ``None``.
    """
    load_dotenv()
    vector_store_name = os.environ.get("resume_compare")
    print(f"vector store name {vector_store_name}")
    file_search.add_files(vector_store_name, [resume1, resume2])
def concatenate_pdfs(pdf1, pdf2, output_file="pdf_concat.pdf"):
    """Concatenate two PDFs into a single file.

    Args:
        pdf1: First PDF; its pages come first in the output.
        pdf2: Second PDF; its pages are appended after ``pdf1``.
        output_file: Where the combined PDF is written. Defaults to
            ``"pdf_concat.pdf"``, the path used before this parameter
            existed.

    Returns:
        ``output_file``, the path the combined PDF was saved to.
    """
    # Context managers close both the output and each input document,
    # including when inserting or saving raises.
    with fitz.open() as output_pdf:
        for pdf_file in (pdf1, pdf2):
            with fitz.open(pdf_file) as pdf_document:
                # Insert the entire PDF into the output PDF
                output_pdf.insert_pdf(pdf_document)
        output_pdf.save(output_file)
    return output_file