Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File update #99

Merged
merged 47 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
c42f606
added the add_users() for Canvas
star-nox Aug 10, 2023
6854205
added canvas course ingest
star-nox Aug 13, 2023
54e3fb0
updated requirements
star-nox Aug 13, 2023
07238a2
added .md ingest and fixed .py ingest
star-nox Aug 15, 2023
deceb15
deleted test ipynb file
star-nox Aug 15, 2023
27383e1
added nomic viz
star-nox Aug 16, 2023
6f08340
added canvas file update function
Aug 21, 2023
34cbbdc
completed update function
star-nox Aug 25, 2023
efd9048
updated course export to include all contents
star-nox Aug 25, 2023
bf3726b
modified to handle diff file structures of downloaded content
star-nox Aug 25, 2023
93646ac
modified canvas update
Aug 30, 2023
05ab444
modified add_users() and ingest_course_content() functions
Sep 21, 2023
f5655ab
modified ingest function
star-nox Sep 21, 2023
6f80b96
modified update_files() for file replacement
star-nox Sep 22, 2023
0223a22
removed the extra os.remove()
star-nox Sep 22, 2023
2e10cc8
fix underscore to dash in for pip
KastanDay Sep 29, 2023
a38fb90
removed json import and added abort to canvas functions
star-nox Oct 2, 2023
79142c5
Merge branch 'main' into canvas
star-nox Oct 2, 2023
118b725
created separate PR for file update
star-nox Oct 2, 2023
35a50a8
added file-update logic in ingest, WIP
star-nox Oct 11, 2023
8499603
removed irrelevant text files
star-nox Oct 11, 2023
4319578
modified pdf ingest function
star-nox Oct 19, 2023
0daac23
fixed PDF duplicate issue
star-nox Oct 20, 2023
dd05d51
removed unwanted files
star-nox Oct 20, 2023
c92aea2
updated nomic version in requirements.txt
star-nox Nov 6, 2023
e11fc6e
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 6, 2023
c01d1bc
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 8, 2023
31002ed
modified s3_paths
star-nox Nov 15, 2023
21f64fb
Merge branch 'main' into file-update
star-nox Nov 15, 2023
0a0e870
testing unique filenames in aws upload
star-nox Nov 16, 2023
bcefb36
added missing library to requirements.txt
star-nox Nov 16, 2023
3bda544
finished check_for_duplicates()
star-nox Nov 16, 2023
b63ca84
fixed filename errors
star-nox Nov 16, 2023
273d598
Merge branch 'main' into file-update
star-nox Nov 16, 2023
a1e0f4b
minor corrections
star-nox Nov 16, 2023
290c616
added a uuid check in check_for_duplicates()
star-nox Nov 20, 2023
7a5cc3a
Merge branch 'main' into file-update
star-nox Nov 21, 2023
bd73036
regex depends on this being a dash
KastanDay Dec 11, 2023
2a6f4b2
regex depends on this being a dash
KastanDay Dec 11, 2023
a1b4127
Fix bug when no duplicate exists.
KastanDay Dec 12, 2023
e01ee11
cleaning up prints, testing looks good. ready to merge
KastanDay Dec 12, 2023
154d45b
Further print and logging refinement
KastanDay Dec 12, 2023
f7ee763
Remove s3 based method for de-duplication, use Supabase only
KastanDay Dec 12, 2023
2b43ab0
remove duplicate imports
KastanDay Dec 12, 2023
36145d3
remove new requirement
KastanDay Dec 12, 2023
b76b449
Final print cleanups
KastanDay Dec 12, 2023
c42ff61
remove pypdf import
KastanDay Dec 12, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions ai_ta_backend/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from multiprocessing import Lock, cpu_count
from multiprocessing.pool import ThreadPool
from typing import List, Optional

import uuid
import boto3


Expand Down Expand Up @@ -38,7 +38,12 @@ def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[st
s3_paths_lock = Lock()

def upload(myfile):
    """
    Upload one local file to S3 under a UUID-prefixed name and record its key.

    Args:
      myfile: path of the local file to upload
    """
    # Prefix the original filename with a fresh UUID4 and '_' so every upload
    # gets its own S3 key. The prefix is 36 + 1 = 37 characters long, which is
    # the length the ingest code strips to recover the readable filename.
    unique_filename = str(uuid.uuid4()) + '_' + os.path.basename(myfile)

    s3_file = f"courses/{course_name}/{unique_filename}"
    s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file)
    # s3_paths is shared between the worker threads, so guard the append.
    with s3_paths_lock:
        s3_paths.append(s3_file)
Expand Down
1 change: 1 addition & 0 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import WebScrape, mit_course_download
from ai_ta_backend.canvas import CanvasAPI
from ai_ta_backend.export_data import export_convo_history_csv

app = Flask(__name__)
Expand Down
92 changes: 92 additions & 0 deletions ai_ta_backend/update_materials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import shutil
import boto3
from ai_ta_backend.vector_database import Ingest
import hashlib
from ai_ta_backend.aws import upload_data_files_to_s3
from ai_ta_backend.vector_database import Ingest


def generate_checksum(file_path: str, chunk_size: int = 4096) -> str:
    """
    Return the hexadecimal MD5 digest of the file at `file_path`.

    The file is opened in binary mode and hashed block by block, so the whole
    file is never held in memory at once.

    Args:
      file_path: path of the file to hash
      chunk_size: number of bytes read per block; must be positive
    Returns:
      The MD5 digest as a hex string.
    Raises:
      ValueError: if `chunk_size` is not a positive integer.
    """
    # read(0) would end the loop immediately and read(-1) would load the whole
    # file, so reject non-positive sizes instead of hashing incorrectly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    md5_hash = hashlib.md5()
    with open(file_path, "rb") as file:
        # Read and update the hash value in blocks until read() returns b"".
        for byte_block in iter(lambda: file.read(chunk_size), b""):
            md5_hash.update(byte_block)
    return md5_hash.hexdigest()

# need to modify this function - switch to Supabase instead of S3
def update_files(source_path: str, course_name: str):
    """
    Compares local files with the course's files in S3 and updates S3 and QDRANT.

    For every file in `source_path`:
      * if its MD5 checksum equals the ETag of an existing S3 object, the local
        copy is deleted (nothing to upload);
      * otherwise, if an S3 object has the same file name, that older object is
        deleted through the ingester so the local file can replace it.
    Whatever remains in `source_path` is uploaded to S3 and ingested, and the
    local directory is removed afterwards.

    Args:
      source_path: path to the directory containing the files to be uploaded
      course_name: name of the course whose files need to be updated
    Returns:
      The string "Success".
    """
    print("In update_files")

    ingester = Ingest()
    # Get S3 paths of files for given course_name
    s3_files = ingester.getAll(course_name)
    print("s3 files: ", s3_files)

    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    )
    bucket_name = os.getenv('S3_BUCKET_NAME')

    # ETags are looked up once per S3 key and cached; without the cache every
    # local file would trigger one S3 request per existing object.
    etag_by_path = {}

    def s3_etag(s3_path):
        """Return the unquoted ETag of `s3_path`, fetching it at most once."""
        if s3_path not in etag_by_path:
            # head_object returns the metadata only, so no object body is
            # opened and left unread.
            s3_object = s3_client.head_object(Bucket=bucket_name, Key=s3_path)
            # S3 wraps the ETag in double quotes.
            # NOTE(review): the ETag is only the plain MD5 of the content for
            # non-multipart, non-KMS uploads - confirm that holds for this bucket.
            etag_by_path[s3_path] = s3_object['ETag'].strip('"')
        return etag_by_path[s3_path]

    total_files = 0
    files_removed = 0

    for file in os.listdir(source_path):
        filepath = os.path.join(source_path, file)
        total_files += 1
        file_checksum = generate_checksum(filepath)

        # compare this checksum with the checksum of every known S3 file
        for s3_file in s3_files:
            s3_path = s3_file['s3_path']

            # identical content already in S3 - drop the local copy
            if file_checksum == s3_etag(s3_path):
                print("checksums match: ", filepath)
                os.remove(filepath)
                files_removed += 1
                # Stop scanning: the file is gone, so a second match would
                # raise FileNotFoundError and double-count the removal.
                break

            # different checksum but same file name - delete the old S3 file
            # NOTE(review): uploads now carry a uuid prefix in the S3 key, so
            # this name comparison may never match for such keys - confirm.
            if file == s3_path.split('/')[-1]:
                print("in replace file condition: ", file)
                # Argument order (course_name, s3_path, '') mirrors the
                # delete_data call made by Ingest.check_for_duplicates.
                delete_s3_file = ingester.delete_data(course_name, s3_path, '')
                print("deletion update: ", delete_s3_file)
                # Safe to mutate the list here because we leave the loop at once.
                s3_files.remove(s3_file)
                break

    print("total files: ", total_files)
    print("files removed: ", files_removed)

    if total_files - files_removed > 0:
        # Upload files to S3 and ingest
        new_s3_paths = upload_data_files_to_s3(course_name, source_path)
        # upload_data_files_to_s3 is annotated Optional; skip ingest if it
        # produced nothing instead of passing None along.
        if new_s3_paths:
            ingester.bulk_ingest(new_s3_paths, course_name=course_name)

    # Delete files from local directory
    shutil.rmtree(source_path)

    return "Success"



114 changes: 98 additions & 16 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import time
import traceback
import uuid
import re
from importlib import metadata
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import boto3
import fitz
from pypdf import PdfReader
import openai
import pytesseract
import supabase
Expand Down Expand Up @@ -167,7 +169,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -177,6 +179,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
os.remove(file_path)

success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
print("Python ingest: ", success_or_failure)
return success_or_failure

except Exception as e:
Expand All @@ -199,7 +202,7 @@ def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -225,6 +228,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
title = title.replace("_", " ")
title = title.replace("/", " ")
title = title.strip()
title = title[37:] # removing the uuid prefix
text = [soup.get_text()]

metadata: List[Dict[str, Any]] = [{
Expand Down Expand Up @@ -306,7 +310,7 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': text.index(txt),
'url': '',
Expand All @@ -332,7 +336,7 @@ def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -359,7 +363,7 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand Down Expand Up @@ -387,7 +391,7 @@ def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand Down Expand Up @@ -422,7 +426,7 @@ def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -449,7 +453,7 @@ def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand Down Expand Up @@ -500,7 +504,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):

# Extract text
text = page.get_text().encode("utf8").decode("utf8", errors='ignore') # get plain text (is in UTF-8)
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:]))

metadatas: List[Dict[str, Any]] = [
{
Expand All @@ -515,8 +519,9 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
]
pdf_texts = [page['text'] for page in pdf_pages_OCRed]

self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
print("Success pdf ingest")
success_or_failure = self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
print("PDF message: ", success_or_failure)
return success_or_failure
except Exception as e:
err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
Expand All @@ -543,7 +548,7 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand Down Expand Up @@ -575,7 +580,7 @@ def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand Down Expand Up @@ -709,8 +714,8 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
metadatas (List[Dict[str, Any]]): _description_
"""
print("In split and upload")
print(f"metadatas: {metadatas}")
print(f"Texts: {texts}")
# print(f"metadatas: {metadatas}")
# print(f"Texts: {texts}")
assert len(texts) == len(metadatas), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}'

try:
Expand All @@ -721,6 +726,16 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
)
contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]
print("METADATAS: ", metadatas)

# check for duplicates
is_duplicate = self.check_for_duplicates(input_texts, metadatas)
print("is_duplicate: ", is_duplicate)
if is_duplicate:
print("split_and_upload returning duplicate")
return "🚫🚫 Duplicate, ingest skipped.🚫🚫"

print("split_and_upload continuing...")

# adding chunk index to metadata for parent doc retrieval
for i, context in enumerate(contexts):
Expand Down Expand Up @@ -1201,6 +1216,73 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]:

return contexts


def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool:
    """
    Decide whether the incoming document is already stored in Supabase.

    Using the first metadata entry, previously ingested docs are fetched from
    Supabase either by file name (when an S3 path is given) or by URL. The
    stored contexts of the most recent match are concatenated and compared
    with the concatenated incoming texts.

    Args:
      texts: incoming chunks, each a dict with an 'input' key holding the text
      metadatas: metadata dicts; only the first entry is inspected
        ('course_name', 's3_path', 'url')
    Returns:
      True if the most recent stored doc has exactly the same text (duplicate).
      False if nothing matching is stored, or if the stored text differs - in
      that case all older matching docs are deleted so the new one replaces them.
    """
    print("in check_for_duplicates")

    # Nothing to identify the document by - cannot be a known duplicate.
    if not metadatas:
        return False

    doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')
    course_name = metadatas[0]['course_name']
    incoming_s3_path = metadatas[0].get('s3_path', '')
    url = metadatas[0].get('url', '')

    if incoming_s3_path:
        filename = incoming_s3_path
        incoming_filename = incoming_s3_path.split('/')[-1]

        # Uploaded files are prefixed with a uuid4 plus a one-character
        # separator (37 characters in total). Only strip when the uuid really
        # is at the start of the name; a uuid elsewhere in the name must not
        # cause the first 37 characters to be cut off.
        pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
        if pattern.match(incoming_filename):
            original_filename = incoming_filename[37:]
        else:
            original_filename = incoming_filename
        print("Extracted filename from incoming s3_path: ", original_filename)

        # NOTE(review): '_' and '%' inside the filename act as LIKE wildcards
        # here, so the match can be broader than the literal name.
        supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
            'course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
    elif url:
        filename = url
        supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq(
            'course_name', course_name).eq('url', url).order('id', desc=True).execute()
    else:
        # Neither an S3 path nor a URL: there is nothing to look up.
        print("No s3_path or url in metadata, skipping duplicate check.")
        return False

    print("no. of docs previously present: ", len(supabase_contents.data))

    if not supabase_contents.data:
        # filename does not already exist in Supabase, so it is a brand new file
        print(f"File 📄: {filename} is NOT a duplicate!")
        return False

    # Results are ordered by id descending, so index 0 is the newest stored doc.
    supabase_whole_text = "".join(text['text'] for text in supabase_contents.data[0]['contexts'])
    current_whole_text = "".join(text['input'] for text in texts)

    if supabase_whole_text == current_whole_text:  # matches the previous file
        print(f"The file 📄: {filename} is a duplicate!")
        return True

    # the file is updated
    print(f"The file 📄: {filename} seems to be updated! Deleting the older file...")

    # call the delete function on every older doc that matched
    # NOTE(review): docs found via the url branch are also deleted by their
    # s3_path with an empty third argument - confirm delete_data handles that.
    for content in supabase_contents.data:
        print("older s3_path to be deleted: ", content['s3_path'])
        delete_status = self.delete_data(course_name, content['s3_path'], '')
        print("delete_status: ", delete_status)
    return False



if __name__ == '__main__':
pass
3 changes: 3 additions & 0 deletions ai_ta_backend/web_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import shutil
import time
import uuid
from collections import Counter
from tempfile import NamedTemporaryFile
from zipfile import ZipFile
Expand Down Expand Up @@ -199,6 +200,8 @@ def ingest_file(self, key, course_name, path_name, base_url):
print("Writing", key[2] ,"to temp file")
temp_file.write(key[1])
temp_file.seek(0)
path_name = str(uuid.uuid4()) + '_' + path_name
print("path name in webscrape: ", path_name)
s3_upload_path = "courses/"+ course_name + "/" + path_name + key[2]
with open(temp_file.name, 'rb') as f:
print("Uploading", key[2] ,"to S3")
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,5 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: ht

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl

pypdf