Skip to content

Commit

Permalink
fixed PDF duplicate issue
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Oct 20, 2023
1 parent 4319578 commit 0daac23
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 137 deletions.
190 changes: 53 additions & 137 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
os.remove(file_path)

success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
print("Python ingest: ", success_or_failure)
return success_or_failure

except Exception as e:
Expand Down Expand Up @@ -406,7 +407,7 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
with NamedTemporaryFile() as tmpfile:
# download from S3 into pdf_tmpfile
# download from S3 into tmpfile
print("Bucket: ", os.getenv('S3_BUCKET_NAME'))
print("Key: ", s3_path)
self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile)
Expand Down Expand Up @@ -459,29 +460,29 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
print(f"SRT ERROR {e}")
return f"Error: {e}"


def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
"""
Grab the first page as an image and upload to S3.
Extract text from the doc.
Read the text of every page of the PDF, and also grab the first page as a PNG image.
LangChain `Documents` have .metadata and .page_content attributes.
Be sure to use TemporaryFile() to avoid memory leaks!
"""
try:
with NamedTemporaryFile() as pdf_tmpfile:
# download from S3 into pdf_tmpfile
self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
reader = PdfReader(pdf_tmpfile.name)
no_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()
print("len of text: ", len(text))

# uploading the 1st page as a PNG file
doc = fitz.open(pdf_tmpfile.name)
### READ OCR of PDF
doc = fitz.open(pdf_tmpfile.name) # type: ignore

# improve quality of the image
zoom_x = 2.0 # horizontal zoom
zoom_y = 2.0 # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

for i, page in enumerate(doc):
pdf_pages_OCRed: List[Dict] = []
for i, page in enumerate(doc): # type: ignore

# UPLOAD FIRST PAGE IMAGE to S3
if i == 0:
with NamedTemporaryFile(suffix=".png") as first_page_png:
pix = page.get_pixmap(matrix=mat)
Expand All @@ -492,15 +493,10 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
with open(first_page_png.name, 'rb') as f:
print("Uploading image png to S3")
self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)
else:
break

# extracting text from the file
pdf_pages_extracted: List[Dict] = []
pages = reader.pages
for i, page in enumerate(pages):
text = page.extract_text()
pdf_pages_extracted.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))

# Extract text
text = page.get_text().encode("utf8").decode('ascii', errors='ignore') # get plain text (is in UTF-8)
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))

if kwargs['kwargs'] == {}:
url = ''
Expand All @@ -514,100 +510,30 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
base_url = kwargs['kwargs']['base_url']
else:
base_url = ''

metadata: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'pagenumber': page['page_number'] + 1, # +1 for human indexing
'timestamp': '',
'readable_filename': page['readable_filename'],
'url': url,
'base_url': base_url,}
for page in pdf_pages_extracted]

pdf_texts = [page['text'] for page in pdf_pages_extracted]

self.split_and_upload(texts=pdf_texts, metadatas=metadata)
print("Success pdf ingest")

except Exception as e:
print(f"ERROR IN PDF READING {e}")
return f"Error: {e}"

# def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
# """
# Both OCR the PDF. And grab the first image as a PNG.
# LangChain `Documents` have .metadata and .page_content attributes.
# Be sure to use TemporaryFile() to avoid memory leaks!
# """
# try:
# with NamedTemporaryFile() as pdf_tmpfile:
# # download from S3 into pdf_tmpfile
# self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile)
# ### READ OCR of PDF
# doc = fitz.open(pdf_tmpfile.name) # type: ignore

# # improve quality of the image
# zoom_x = 2.0 # horizontal zoom
# zoom_y = 2.0 # vertical zoom
# mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension

# pdf_pages_OCRed: List[Dict] = []
# for i, page in enumerate(doc): # type: ignore

# # UPLOAD FIRST PAGE IMAGE to S3
# if i == 0:
# with NamedTemporaryFile(suffix=".png") as first_page_png:
# pix = page.get_pixmap(matrix=mat)
# pix.save(first_page_png) # store image as a PNG

# s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png"
# first_page_png.seek(0) # Seek the file pointer back to the beginning
# with open(first_page_png.name, 'rb') as f:
# print("Uploading image png to S3")
# self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)

# # Extract text
# text = page.get_text().encode("utf8").decode('ascii', errors='ignore') # get plain text (is in UTF-8)
# print("len of text: ", len(text))
# #exit()
# pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))

# if kwargs['kwargs'] == {}:
# url = ''
# base_url = ''
# else:
# if 'url' in kwargs['kwargs'].keys():
# url = kwargs['kwargs']['url']
# else:
# url = ''
# if 'base_url' in kwargs['kwargs'].keys():
# base_url = kwargs['kwargs']['base_url']
# else:
# base_url = ''


# metadatas: List[Dict[str, Any]] = [
# {
# 'course_name': course_name,
# 's3_path': s3_path,
# 'pagenumber': page['page_number'] + 1, # +1 for human indexing
# 'timestamp': '',
# 'readable_filename': page['readable_filename'],
# 'url': url,
# 'base_url': base_url,
# } for page in pdf_pages_OCRed
# ]
# pdf_texts = [page['text'] for page in pdf_pages_OCRed]

# self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
# print("Success pdf ingest")
# except Exception as e:
# print("ERROR IN PDF READING ")
# print(e)
# return f"Error {e}"
# return "Success"
metadatas: List[Dict[str, Any]] = [
{
'course_name': course_name,
's3_path': s3_path,
'pagenumber': page['page_number'] + 1, # +1 for human indexing
'timestamp': '',
'readable_filename': page['readable_filename'],
'url': url,
'base_url': base_url,
} for page in pdf_pages_OCRed
]
pdf_texts = [page['text'] for page in pdf_pages_OCRed]

success_or_failure = self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
print("PDF message: ", success_or_failure)
return success_or_failure
except Exception as e:
print("ERROR IN PDF READING ")
print(e)
return f"Error {e}"


def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
"""Ingest a single .txt or .md file from S3.
Args:
Expand Down Expand Up @@ -797,11 +723,6 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
assert len(texts) == len(metadatas), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}'

try:
# check for duplicates
is_duplicate = self.check_for_duplicates(texts, metadatas)
if is_duplicate:
return "🚫🚫 Duplicate, ingest skipped.🚫🚫"

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=1000,
chunk_overlap=150,
Expand All @@ -810,6 +731,15 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]

# check for duplicates
is_duplicate = self.check_for_duplicates(input_texts, metadatas)
print("is_duplicate: ", is_duplicate)
if is_duplicate:
print("split_and_upload returning duplicate")
return "🚫🚫 Duplicate, ingest skipped.🚫🚫"

print("split_and_upload continuing...")

oai = OpenAIAPIProcessor(input_prompts_list=input_texts,
request_url='https://api.openai.com/v1/embeddings',
api_key=os.getenv('OPENAI_API_KEY'),
Expand Down Expand Up @@ -1195,11 +1125,13 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]:

return contexts

def check_for_duplicates(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> bool:
def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool:
"""
For the given metadata, fetch docs from Supabase based on S3 path or URL.
If docs exist, concatenate their texts and compare with the current texts; if they are the same, return True.
"""
print("in check_for_duplicates")

doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')
course_name = metadatas[0]['course_name']
s3_path = metadatas[0]['s3_path']
Expand All @@ -1224,35 +1156,19 @@ def check_for_duplicates(self, texts: List[str], metadatas: List[Dict[str, Any]]
if len(supabase_contents.data) > 0:
# concatenate texts
supabase_contexts = supabase_contents.data[0]

for text in supabase_contexts['contexts']:
supabase_whole_text += text['text']

# print("supabase_whole_text length: ", len(supabase_whole_text.split()))
# supabase_whole_text = " ".join(supabase_whole_text.split())
print("supabase_whole_text: ", len(supabase_whole_text))

# compare with current texts
current_whole_text = ""
for text in texts:
current_whole_text += text
# print("current_whole_text length: ", len(current_whole_text.split()))
# current_whole_text = " ".join(current_whole_text.split())
print("current_whole_text: ", len(current_whole_text))

current_whole_text += text['input']

# compare with current texts
if supabase_whole_text == current_whole_text:
print(f"The file 📄: {filename} is a duplicate!")
return True
else:
print(f"The file 📄: {filename} is NOT a duplicate!")
print("supabase_whole_text: ", len(supabase_whole_text))
print("\n\n")
print("current_whole_text: ", len(current_whole_text))
print("\n\n")
with open("supabase_whole_text.txt", "w") as f:
f.write(supabase_whole_text)
with open("current_whole_text.txt", "w") as f:
f.write(current_whole_text)
return False
else:
print(f"File 📄: {filename} is NOT a duplicate!")
Expand Down
19 changes: 19 additions & 0 deletions current_whole_text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Course Title: Advanced Machine Learning: Algorithms, Theory, and ApplicationsLevel: Graduate
Course Description:This graduate-level course delves into the advanced concepts, algorithms, and theoretical foundations of machine learning. It covers a broad spectrum of topics, including deep learning, reinforcement learning, probabilistic graphical models, and more. Students will gain an in-depth understanding of the theoretical underpinnings behind various machine learning techniques, as well as hands-on experience in applying these techniques to real-world problems. The course emphasizes critical thinking, problem-solving, and the ability to adapt machine learning methods to new challenges.
Prerequisites:- Undergraduate-level machine learning course or equivalent knowledge.- Proficiency in programming (Python preferred).- Linear algebra, calculus, and probability/statistics.
Instructor:[Instructor Name][Instructor Contact Information]
Course Objectives:By the end of the course, students will be able to:1. Understand the theoretical foundations of advanced machine learning algorithms.2. Analyze and critically evaluate the strengths and limitations of different machine learning methods.3. Implement and fine-tune complex machine learning models for various applications.4. Apply machine learning techniques to real-world datasets, addressing practical challenges.5. Stay updated with recent advancements in the field and adapt them to novel problems.
Grading Components:- Assignments: 40%- Midterm Exam: 20%- Final Project: 30%- Class Participation: 10%
Textbooks:1. "Pattern Recognition and Machine Learning" by Christopher M. Bishop2. "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville3. Additional research papers and online resources
Course Outline:
Module 1: Fundamentals of Advanced Machine Learning- Review of basic concepts in machine learning- Bias-variance tradeoff and model complexity- Regularization techniques and their applications- Model selection and evaluation strategies
Module 2: Probabilistic Graphical Models- Bayesian networks and inference- Hidden Markov models- Conditional random fields- Latent Dirichlet allocation (LDA) and topic modeling
Module 3: Deep Learning: Architectures and Training- Neural network fundamentals and activation functions- Convolutional neural networks (CNNs) for image analysis- Recurrent neural networks (RNNs) and sequence modeling- Training techniques: optimization, dropout, batch normalization
Module 4: Generative Models- Variational autoencoders (VAEs)- Generative adversarial networks (GANs)- Applications of generative models in data synthesis and augmentation
Module 5: Reinforcement Learning- Markov decision processes (MDPs)- Policy gradients and actor-critic methods- Q-learning and deep Q-networks (DQNs)- Applications in game playing and robotic control
Module 6: Advanced Topics in Machine Learning- Transfer learning and domain adaptation- Explainable AI and interpretability- Fairness and ethics in machine learning- Recent advancements in the field (attention mechanisms, transformers, etc.)
Module 7: Final ProjectStudents will work on a semester-long project, applying the concepts learned to solve a real-world problem of their choice. The project will include problem formulation, data preprocessing, model selection, implementation, and a final presentation.
Note: The syllabus is subject to change based on the instructor's discretion and the evolving landscape of machine learning research.

--> Adding some new text to check file update!

18 changes: 18 additions & 0 deletions supabase_whole_text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Course Title: Advanced Machine Learning: Algorithms, Theory, and ApplicationsLevel: Graduate
Course Description:This graduate-level course delves into the advanced concepts, algorithms, and theoretical foundations of machine learning. It covers a broad spectrum of topics, including deep learning, reinforcement learning, probabilistic graphical models, and more. Students will gain an in-depth understanding of the theoretical underpinnings behind various machine learning techniques, as well as hands-on experience in applying these techniques to real-world problems. The course emphasizes critical thinking, problem-solving, and the ability to adapt machine learning methods to new challenges.
Prerequisites:- Undergraduate-level machine learning course or equivalent knowledge.- Proficiency in programming (Python preferred).- Linear algebra, calculus, and probability/statistics.
Instructor:[Instructor Name][Instructor Contact Information]
Course Objectives:By the end of the course, students will be able to:1. Understand the theoretical foundations of advanced machine learning algorithms.2. Analyze and critically evaluate the strengths and limitations of different machine learning methods.3. Implement and fine-tune complex machine learning models for various applications.4. Apply machine learning techniques to real-world datasets, addressing practical challenges.5. Stay updated with recent advancements in the field and adapt them to novel problems.
Grading Components:- Assignments: 40%- Midterm Exam: 20%- Final Project: 30%- Class Participation: 10%
Textbooks:1. "Pattern Recognition and Machine Learning" by Christopher M. Bishop2. "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville3. Additional research papers and online resources
Course Outline:
Module 1: Fundamentals of Advanced Machine Learning- Review of basic concepts in machine learning- Bias-variance tradeoff and model complexity- Regularization techniques and their applications- Model selection and evaluation strategies
Module 2: Probabilistic Graphical Models- Bayesian networks and inference- Hidden Markov models- Conditional random fields- Latent Dirichlet allocation (LDA) and topic modeling
Module 3: Deep Learning: Architectures and Training- Neural network fundamentals and activation functions- Convolutional neural networks (CNNs) for image analysis- Recurrent neural networks (RNNs) and sequence modeling- Training techniques: optimization, dropout, batch normalization
Module 4: Generative Models- Variational autoencoders (VAEs)- Generative adversarial networks (GANs)- Applications of generative models in data synthesis and augmentation
Module 5: Reinforcement Learning- Markov decision processes (MDPs)- Policy gradients and actor-critic methods- Q-learning and deep Q-networks (DQNs)- Applications in game playing and robotic control
Module 6: Advanced Topics in Machine Learning- Transfer learning and domain adaptation- Explainable AI and interpretability- Fairness and ethics in machine learning- Recent advancements in the field (attention mechanisms, transformers, etc.)
Module 7: Final ProjectStudents will work on a semester-long project, applying the concepts learned to solve a real-world problem of their choice. The project will include problem formulation, data preprocessing, model selection, implementation, and a final presentation.
Note: The syllabus is subject to change based on the instructor's discretion and the evolving landscape of machine learning research.

--> Adding some new text to check file update!

0 comments on commit 0daac23

Please sign in to comment.