File update #99

Merged: 47 commits, Dec 12, 2023

Commits (47)
c42f606
added the add_users() for Canvas
star-nox Aug 10, 2023
6854205
added canvas course ingest
star-nox Aug 13, 2023
54e3fb0
updated requirements
star-nox Aug 13, 2023
07238a2
added .md ingest and fixed .py ingest
star-nox Aug 15, 2023
deceb15
deleted test ipynb file
star-nox Aug 15, 2023
27383e1
added nomic viz
star-nox Aug 16, 2023
6f08340
added canvas file update function
Aug 21, 2023
34cbbdc
completed update function
star-nox Aug 25, 2023
efd9048
updated course export to include all contents
star-nox Aug 25, 2023
bf3726b
modified to handle diff file structures of downloaded content
star-nox Aug 25, 2023
93646ac
modified canvas update
Aug 30, 2023
05ab444
modified add_users() and ingest_course_content() functions
Sep 21, 2023
f5655ab
modified ingest function
star-nox Sep 21, 2023
6f80b96
modified update_files() for file replacement
star-nox Sep 22, 2023
0223a22
removed the extra os.remove()
star-nox Sep 22, 2023
2e10cc8
fix underscore to dash for pip
KastanDay Sep 29, 2023
a38fb90
removed json import and added abort to canvas functions
star-nox Oct 2, 2023
79142c5
Merge branch 'main' into canvas
star-nox Oct 2, 2023
118b725
created separate PR for file update
star-nox Oct 2, 2023
35a50a8
added file-update logic in ingest, WIP
star-nox Oct 11, 2023
8499603
removed irrelevant text files
star-nox Oct 11, 2023
4319578
modified pdf ingest function
star-nox Oct 19, 2023
0daac23
fixed PDF duplicate issue
star-nox Oct 20, 2023
dd05d51
removed unwanted files
star-nox Oct 20, 2023
c92aea2
updated nomic version in requirements.txt
star-nox Nov 6, 2023
e11fc6e
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 6, 2023
c01d1bc
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 8, 2023
31002ed
modified s3_paths
star-nox Nov 15, 2023
21f64fb
Merge branch 'main' into file-update
star-nox Nov 15, 2023
0a0e870
testing unique filenames in aws upload
star-nox Nov 16, 2023
bcefb36
added missing library to requirements.txt
star-nox Nov 16, 2023
3bda544
finished check_for_duplicates()
star-nox Nov 16, 2023
b63ca84
fixed filename errors
star-nox Nov 16, 2023
273d598
Merge branch 'main' into file-update
star-nox Nov 16, 2023
a1e0f4b
minor corrections
star-nox Nov 16, 2023
290c616
added a uuid check in check_for_duplicates()
star-nox Nov 20, 2023
7a5cc3a
Merge branch 'main' into file-update
star-nox Nov 21, 2023
bd73036
regex depends on this being a dash
KastanDay Dec 11, 2023
2a6f4b2
regex depends on this being a dash
KastanDay Dec 11, 2023
a1b4127
Fix bug when no duplicate exists.
KastanDay Dec 12, 2023
e01ee11
cleaning up prints, testing looks good. ready to merge
KastanDay Dec 12, 2023
154d45b
Further print and logging refinement
KastanDay Dec 12, 2023
f7ee763
Remove s3 based method for de-duplication, use Supabase only
KastanDay Dec 12, 2023
2b43ab0
remove duplicate imports
KastanDay Dec 12, 2023
36145d3
remove new requirement
KastanDay Dec 12, 2023
b76b449
Final print cleanups
KastanDay Dec 12, 2023
c42ff61
remove pypdf import
KastanDay Dec 12, 2023
9 changes: 7 additions & 2 deletions ai_ta_backend/aws.py
@@ -2,7 +2,7 @@
from multiprocessing import Lock, cpu_count
from multiprocessing.pool import ThreadPool
from typing import List, Optional

import uuid
import boto3


@@ -38,7 +38,12 @@ def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[st
s3_paths_lock = Lock()

def upload(myfile):
s3_file = f"courses/{course_name}/{os.path.basename(myfile)}"
# get the last part of the path and append unique ID before it
directory, old_filename = os.path.split(myfile)
new_filename = str(uuid.uuid4()) + '-' + old_filename
new_filepath = os.path.join(directory, new_filename)

s3_file = f"courses/{course_name}/{os.path.basename(new_filepath)}"
s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file)
with s3_paths_lock:
s3_paths.append(s3_file)
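
Note on the naming convention above: str(uuid.uuid4()) is 36 characters, so the uuid plus the joining dash makes a 37-character prefix, which is why the ingest code below strips readable filenames with [37:]. A minimal sketch of the round trip (helper names are illustrative, not part of this PR):

import os
import uuid

def add_uuid_prefix(filepath: str) -> str:
    # prepend a uuid4 (36 chars) plus a dash: a 37-char prefix in total
    directory, old_filename = os.path.split(filepath)
    return os.path.join(directory, str(uuid.uuid4()) + '-' + old_filename)

def strip_uuid_prefix(filename: str) -> str:
    # recover the human-readable name by dropping the 37-char prefix
    return filename[37:]

prefixed = os.path.basename(add_uuid_prefix("/tmp/lecture01.pdf"))
assert strip_uuid_prefix(prefixed) == "lecture01.pdf"
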
102 changes: 86 additions & 16 deletions ai_ta_backend/vector_database.py
@@ -8,6 +8,7 @@
import time
import traceback
import uuid
import re
from importlib import metadata
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -167,7 +168,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -177,6 +178,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
os.remove(file_path)

success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
print("Python ingest: ", success_or_failure)
return success_or_failure

except Exception as e:
@@ -199,7 +201,7 @@ def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -225,6 +227,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
title = title.replace("_", " ")
title = title.replace("/", " ")
title = title.strip()
title = title[37:] # removing the uuid prefix
text = [soup.get_text()]

metadata: List[Dict[str, Any]] = [{
@@ -306,7 +309,7 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': text.index(txt),
'url': '',
@@ -332,7 +335,7 @@ def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -359,7 +362,7 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -387,7 +390,7 @@ def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -422,7 +425,7 @@ def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -449,7 +452,7 @@ def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -500,7 +503,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):

# Extract text
text = page.get_text().encode("utf8").decode("utf8", errors='ignore') # get plain text (is in UTF-8)
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:]))

metadatas: List[Dict[str, Any]] = [
{
@@ -515,10 +518,10 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
]
pdf_texts = [page['text'] for page in pdf_pages_OCRed]

self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
print("Success pdf ingest")
success_or_failure = self.split_and_upload(texts=pdf_texts, metadatas=metadatas)
return success_or_failure
except Exception as e:
err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc() # type: ignore
print(err)
return err
return "Success"
@@ -543,7 +546,7 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -575,7 +578,7 @@ def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name),
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -722,6 +725,11 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]

# check for duplicates
is_duplicate = self.check_for_duplicates(input_texts, metadatas)
if is_duplicate:
return "Success"

# adding chunk index to metadata for parent doc retrieval
for i, context in enumerate(contexts):
context.metadata['chunk_index'] = i
@@ -1087,7 +1095,7 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n
summary = f"\nSummary: {text}"
all_texts += doc + summary + '\n' + separator + '\n'

stuffed_prompt = f"""Please answer the following question.
stuffed_prompt = """Please answer the following question.
Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant.
It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know.
Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable.
@@ -1201,6 +1209,68 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]:

return contexts


def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool:
"""
For given metadata, fetch docs from Supabase based on S3 path or URL.
If docs exists, concatenate the texts and compare with current texts, if same, return True.
"""
doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
course_name = metadatas[0]['course_name']
incoming_s3_path = metadatas[0]['s3_path']
url = metadatas[0]['url']
original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix

# check if uuid exists in s3_path -- not all s3_paths have uuids!
incoming_filename = incoming_s3_path.split('/')[-1]
pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I) # uuid V4 pattern, and v4 only.
if bool(pattern.search(incoming_filename)):
# uuid pattern exists -- remove the uuid and proceed with duplicate checking
original_filename = incoming_filename[37:]
else:
# do not remove anything and proceed with duplicate checking
original_filename = incoming_filename

if incoming_s3_path:
filename = incoming_s3_path
supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
supabase_contents = supabase_contents.data
elif url:
filename = url
supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).eq('url', url).order('id', desc=True).execute()
supabase_contents = supabase_contents.data
else:
filename = None
supabase_contents = []

supabase_whole_text = ""
if len(supabase_contents) > 0: # if a doc with same filename exists in Supabase
# concatenate texts
supabase_contexts = supabase_contents[0]
for text in supabase_contexts['contexts']:
supabase_whole_text += text['text']

current_whole_text = ""
for text in texts:
current_whole_text += text['input']

if supabase_whole_text == current_whole_text: # matches the previous file
print(f"Duplicate ingested! 📄 s3_path: {filename}.")
return True

else: # the file is updated
print(f"Updated file detected! Same filename, new contents. 📄 s3_path: {filename}")

# call the delete function on older docs
for content in supabase_contents:
print("older s3_path to be deleted: ", content['s3_path'])
delete_status = self.delete_data(course_name, content['s3_path'], '')
print("delete_status: ", delete_status)
return False

else: # filename does not already exist in Supabase, so its a brand new file
print(f"NOT a duplicate! 📄s3_path: {filename}")
return False


if __name__ == '__main__':
pass
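
The check_for_duplicates() flow added above boils down to: normalize the incoming filename (strip the uuid prefix only when one is present), fetch prior rows for that filename or URL, and compare the concatenated chunk texts; identical text means a duplicate, while different text triggers deletion of the older rows before re-ingest. A condensed sketch of the two core steps, with the Supabase query replaced by in-memory mock values (for illustration only, not part of this PR):

import re
from typing import Any, Dict, List

# uuid4-only pattern, mirroring the one used in check_for_duplicates()
UUID_V4 = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)

def normalize_filename(s3_path: str) -> str:
    # strip the 37-char uuid prefix only when one is actually present
    filename = s3_path.split('/')[-1]
    return filename[37:] if UUID_V4.search(filename) else filename

def is_same_content(new_chunks: List[Dict[str, Any]], stored_contexts: List[Dict[str, Any]]) -> bool:
    # identical concatenated text means the upload is a duplicate
    stored = "".join(c['text'] for c in stored_contexts)
    incoming = "".join(c['input'] for c in new_chunks)
    return stored == incoming

# hypothetical example: same filename, unchanged contents
print(normalize_filename("courses/ece120/1b9e4d1c-90ab-4e1f-9f3d-2b7c8d9e0f11-notes.txt"))  # notes.txt
print(is_same_content([{'input': 'abc'}], [{'text': 'abc'}]))  # True
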
3 changes: 3 additions & 0 deletions ai_ta_backend/web_scrape.py
@@ -3,6 +3,7 @@
import re
import shutil
import time
import uuid
from collections import Counter
from tempfile import NamedTemporaryFile
from zipfile import ZipFile
@@ -199,6 +200,8 @@ def ingest_file(self, key, course_name, path_name, base_url):
print("Writing", key[2] ,"to temp file")
temp_file.write(key[1])
temp_file.seek(0)
path_name = str(uuid.uuid4()) + '-' + path_name
print("path name in webscrape: ", path_name)
s3_upload_path = "courses/"+ course_name + "/" + path_name + key[2]
with open(temp_file.name, 'rb') as f:
print("Uploading", key[2] ,"to S3")
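
For web-scraped files the uuid is prepended to path_name rather than to a local filename, so the uploaded key follows the same courses/<course_name>/<uuid>-<name><extension> shape that the ingest functions later strip. A small illustration with made-up values (not taken from this PR):

import uuid

course_name = "ece120"   # hypothetical values
path_name = "syllabus"
extension = ".html"

path_name = str(uuid.uuid4()) + '-' + path_name
s3_upload_path = "courses/" + course_name + "/" + path_name + extension
print(s3_upload_path)    # e.g. courses/ece120/0f3a...-syllabus.html
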