Skip to content

Commit

Permalink
added image extraction code to pdf ingest for testing
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 20, 2023
1 parent 98bbdc9 commit 48e968f
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,14 +477,22 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
### READ OCR of PDF
doc = fitz.open(pdf_tmpfile.name) # type: ignore

# improve quality of the image
zoom_x = 2.0 # horizontal zoom
zoom_y = 2.0 # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

pdf_pages_OCRed: List[Dict] = []
for i, page in enumerate(doc): # type: ignore
# image extraction
print("extracting image")
d = page.get_text("dict")
blocks = d['blocks']
img_blocks = [b for b in blocks if b['type']==1]
print(img_blocks[0])
exit()


# UPLOAD FIRST PAGE IMAGE to S3
if i == 0:
Expand Down

0 comments on commit 48e968f

Please sign in to comment.