Commit

modified pdf ingest function

star-nox committed Oct 19, 2023
1 parent 8499603 commit 4319578
Showing 2 changed files with 122 additions and 40 deletions.
161 changes: 121 additions & 40 deletions ai_ta_backend/vector_database.py
@@ -15,6 +15,7 @@

import boto3
import fitz
from pypdf import PdfReader
import openai
import supabase
from bs4 import BeautifulSoup
@@ -457,29 +458,30 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
except Exception as e:
print(f"SRT ERROR {e}")
return f"Error: {e}"

def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
"""
Both OCR the PDF. And grab the first image as a PNG.
LangChain `Documents` have .metadata and .page_content attributes.
Be sure to use TemporaryFile() to avoid memory leaks!
Grab the first page as an image and upload to S3.
Extract text from the doc.
"""
try:
with NamedTemporaryFile() as pdf_tmpfile:
# download from S3 into pdf_tmpfile
self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
reader = PdfReader(pdf_tmpfile.name)
no_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()
print("len of text: ", len(text))

# uploading the 1st page as a PNG file
doc = fitz.open(pdf_tmpfile.name)

# improve quality of the image
zoom_x = 2.0 # horizontal zoom
zoom_y = 2.0 # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

# UPLOAD FIRST PAGE IMAGE to S3
for i, page in enumerate(doc):
if i == 0:
with NamedTemporaryFile(suffix=".png") as first_page_png:
pix = page.get_pixmap(matrix=mat)
@@ -490,10 +492,15 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
with open(first_page_png.name, 'rb') as f:
print("Uploading image png to S3")
self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)

else:
break

# extracting text from the file
pdf_pages_extracted: List[Dict] = []
pages = reader.pages
for i, page in enumerate(pages):
text = page.extract_text()
pdf_pages_extracted.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))

if kwargs['kwargs'] == {}:
url = ''
Expand All @@ -507,28 +514,99 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
base_url = kwargs['kwargs']['base_url']
else:
base_url = ''


metadata: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'pagenumber': page['page_number'] + 1, # +1 for human indexing
'timestamp': '',
'readable_filename': page['readable_filename'],
'url': url,
'base_url': base_url,}
for page in pdf_pages_extracted]

pdf_texts = [page['text'] for page in pdf_pages_extracted]

self.split_and_upload(texts=pdf_texts, metadatas=metadata)
print("Success pdf ingest")

except Exception as e:
print("ERROR IN PDF READING ")
print(e)
return f"Error {e}"
return "Success"
print(f"ERROR IN PDF READING {e}")
return f"Error: {e}"

# def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
# """
# Both OCR the PDF. And grab the first image as a PNG.
# LangChain `Documents` have .metadata and .page_content attributes.
# Be sure to use TemporaryFile() to avoid memory leaks!
# """
# try:
# with NamedTemporaryFile() as pdf_tmpfile:
# # download from S3 into pdf_tmpfile
# self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
# ### READ OCR of PDF
# doc = fitz.open(pdf_tmpfile.name) # type: ignore

# # improve quality of the image
# zoom_x = 2.0 # horizontal zoom
# zoom_y = 2.0 # vertical zoom
# mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

# pdf_pages_OCRed: List[Dict] = []
# for i, page in enumerate(doc): # type: ignore

# # UPLOAD FIRST PAGE IMAGE to S3
# if i == 0:
# with NamedTemporaryFile(suffix=".png") as first_page_png:
# pix = page.get_pixmap(matrix=mat)
# pix.save(first_page_png) # store image as a PNG

# s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png"
# first_page_png.seek(0) # Seek the file pointer back to the beginning
# with open(first_page_png.name, 'rb') as f:
# print("Uploading image png to S3")
# self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)

# # Extract text
# text = page.get_text().encode("utf8").decode('ascii', errors='ignore') # get plain text (is in UTF-8)
# print("len of text: ", len(text))
# #exit()
# pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))

# if kwargs['kwargs'] == {}:
# url = ''
# base_url = ''
# else:
# if 'url' in kwargs['kwargs'].keys():
# url = kwargs['kwargs']['url']
# else:
# url = ''
# if 'base_url' in kwargs['kwargs'].keys():
# base_url = kwargs['kwargs']['base_url']
# else:
# base_url = ''


# metadatas: List[Dict[str, Any]] = [
# {
# 'course_name': course_name,
# 's3_path': s3_path,
# 'pagenumber': page['page_number'] + 1, # +1 for human indexing
# 'timestamp': '',
# 'readable_filename': page['readable_filename'],
# 'url': url,
# 'base_url': base_url,
# } for page in pdf_pages_OCRed
# ]
# pdf_texts = [page['text'] for page in pdf_pages_OCRed]

# self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
# print("Success pdf ingest")
# except Exception as e:
# print("ERROR IN PDF READING ")
# print(e)
# return f"Error {e}"
# return "Success"
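
For contrast with the pypdf path above, a minimal sketch of what this retired PyMuPDF path did per page, plain-text extraction with non-ASCII characters dropped (file name is hypothetical):

import fitz  # PyMuPDF

doc = fitz.open("example.pdf")  # hypothetical local file
for i, page in enumerate(doc):
    # get_text() returns UTF-8 text; the retired code then dropped non-ASCII
    text = page.get_text().encode("utf8").decode("ascii", errors="ignore")
    print(f"page {i}: {len(text)} chars")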

def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
"""Ingest a single .txt or .md file from S3.
@@ -1148,16 +1226,19 @@ def check_for_duplicates(self, texts: List[str], metadatas: List[Dict[str, Any]]
supabase_contexts = supabase_contents.data[0]

for text in supabase_contexts['contexts']:
supabase_whole_text += text['text']

# print("supabase_whole_text length: ", len(supabase_whole_text.split()))
# supabase_whole_text = " ".join(supabase_whole_text.split())
print("supabase_whole_text: ", len(supabase_whole_text))

# compare with current texts
current_whole_text = ""
for text in texts:
current_whole_text += text
# print("current_whole_text length: ", len(current_whole_text.split()))
# current_whole_text = " ".join(current_whole_text.split())
print("current_whole_text: ", len(current_whole_text))

if supabase_whole_text == current_whole_text:
print(f"The file 📄: {filename} is a duplicate!")
1 change: 1 addition & 0 deletions requirements.txt
@@ -44,6 +44,7 @@ cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called
GitPython
nomic
flask-executor
pypdf

# No arize for now, huge build size with these additions.
# arize[AutoEmbeddings, LLM_Evaluation]
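
The unpinned `pypdf` entry installs the latest release via `pip install pypdf`, matching the `from pypdf import PdfReader` import added above.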
