File update #99

Merged · 47 commits · Dec 12, 2023
Changes from 1 commit

Commits (47)
c42f606
added the add_users() for Canvas
star-nox Aug 10, 2023
6854205
added canvas course ingest
star-nox Aug 13, 2023
54e3fb0
updated requirements
star-nox Aug 13, 2023
07238a2
added .md ingest and fixed .py ingest
star-nox Aug 15, 2023
deceb15
deleted test ipynb file
star-nox Aug 15, 2023
27383e1
added nomic viz
star-nox Aug 16, 2023
6f08340
added canvas file update function
Aug 21, 2023
34cbbdc
completed update function
star-nox Aug 25, 2023
efd9048
updated course export to include all contents
star-nox Aug 25, 2023
bf3726b
modified to handle diff file structures of downloaded content
star-nox Aug 25, 2023
93646ac
modified canvas update
Aug 30, 2023
05ab444
modified add_users() and ingest_course_content() functions
Sep 21, 2023
f5655ab
modified ingest function
star-nox Sep 21, 2023
6f80b96
modified update_files() for file replacement
star-nox Sep 22, 2023
0223a22
removed the extra os.remove()
star-nox Sep 22, 2023
2e10cc8
fix underscore to dash for pip
KastanDay Sep 29, 2023
a38fb90
removed json import and added abort to canvas functions
star-nox Oct 2, 2023
79142c5
Merge branch 'main' into canvas
star-nox Oct 2, 2023
118b725
created separate PR for file update
star-nox Oct 2, 2023
35a50a8
added file-update logic in ingest, WIP
star-nox Oct 11, 2023
8499603
removed irrelevant text files
star-nox Oct 11, 2023
4319578
modified pdf ingest function
star-nox Oct 19, 2023
0daac23
fixed PDF duplicate issue
star-nox Oct 20, 2023
dd05d51
removed unwanted files
star-nox Oct 20, 2023
c92aea2
updated nomic version in requirements.txt
star-nox Nov 6, 2023
e11fc6e
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 6, 2023
c01d1bc
Merge branch 'main' of https://github.com/UIUC-Chatbot/ai-ta-backend
star-nox Nov 8, 2023
31002ed
modified s3_paths
star-nox Nov 15, 2023
21f64fb
Merge branch 'main' into file-update
star-nox Nov 15, 2023
0a0e870
testing unique filenames in aws upload
star-nox Nov 16, 2023
bcefb36
added missing library to requirements.txt
star-nox Nov 16, 2023
3bda544
finished check_for_duplicates()
star-nox Nov 16, 2023
b63ca84
fixed filename errors
star-nox Nov 16, 2023
273d598
Merge branch 'main' into file-update
star-nox Nov 16, 2023
a1e0f4b
minor corrections
star-nox Nov 16, 2023
290c616
added a uuid check in check_for_duplicates()
star-nox Nov 20, 2023
7a5cc3a
Merge branch 'main' into file-update
star-nox Nov 21, 2023
bd73036
regex depends on this being a dash
KastanDay Dec 11, 2023
2a6f4b2
regex depends on this being a dash
KastanDay Dec 11, 2023
a1b4127
Fix bug when no duplicate exists.
KastanDay Dec 12, 2023
e01ee11
cleaning up prints, testing looks good. ready to merge
KastanDay Dec 12, 2023
154d45b
Further print and logging refinement
KastanDay Dec 12, 2023
f7ee763
Remove s3-based method for de-duplication, use Supabase only
KastanDay Dec 12, 2023
2b43ab0
remove duplicate imports
KastanDay Dec 12, 2023
36145d3
remove new requirement
KastanDay Dec 12, 2023
b76b449
Final print cleanups
KastanDay Dec 12, 2023
c42ff61
remove pypdf import
KastanDay Dec 12, 2023
Fix bug when no duplicate exists.
KastanDay committed Dec 12, 2023
commit a1b4127bcf1a1fe0289bb18dd4bf6b478623b2f4
19 changes: 12 additions & 7 deletions ai_ta_backend/vector_database.py
@@ -1102,7 +1102,7 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n
       summary = f"\nSummary: {text}"
       all_texts += doc + summary + '\n' + separator + '\n'
 
-    stuffed_prompt = f"""Please answer the following question.
+    stuffed_prompt = """Please answer the following question.
 Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant.
 It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know.
 Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable.
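
The only functional change in this hunk is dropping the f prefix, since this part of the template interpolates nothing. As a quick illustration (a made-up snippet, not from the repo), a plain string leaves braces literal while an f-string evaluates them:

doc = "X"
print(f"in document {doc} it says Y")  # f-string interpolates: in document X it says Y
print("in document {doc} it says Y")   # plain string keeps the braces literal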
@@ -1232,7 +1232,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
 
     # check if uuid exists in s3_path
     incoming_filename = incoming_s3_path.split('/')[-1]
-    pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
+    pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)  # uuid V4 pattern, and v4 only.
     if bool(pattern.search(incoming_filename)):  # uuid pattern exists
       # remove the uuid and proceed with duplicate checking
       original_filename = incoming_filename[37:]
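
Read in isolation, the uuid handling in this hunk amounts to the following standalone sketch. The helper name and sample filename are hypothetical; the regex and the [37:] slice are from the diff, and the offset assumes the dash separator that commits bd73036 and 2a6f4b2 call out:

import re

# uuid V4 pattern, and v4 only (same regex as in the diff above)
UUID4_PATTERN = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)

def strip_uuid_prefix(filename: str) -> str:
  """Return the original filename if it carries a '<uuid4>-' prefix."""
  if UUID4_PATTERN.search(filename):
    # a uuid4 string is 36 chars; index 37 also skips the trailing dash
    return filename[37:]
  return filename

print(strip_uuid_prefix('123e4567-e89b-42d3-a456-426614174000-notes.pdf'))  # -> notes.pdf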
@@ -1243,26 +1243,31 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     if incoming_s3_path:
       filename = incoming_s3_path
       supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
+      supabase_contents = supabase_contents.data
     elif url:
       filename = url
       supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).eq('url', url).order('id', desc=True).execute()
+      supabase_contents = supabase_contents.data
+    else:
+      filename = None
+      supabase_contents = []
 
     supabase_whole_text = ""
-    print("no. of docs previously present: ", len(supabase_contents.data))
+    print("no. of docs previously present: ", len(supabase_contents))
 
-    if len(supabase_contents.data) > 0:  # if a doc with same filename exists in Supabase
+    if len(supabase_contents) > 0:  # if a doc with same filename exists in Supabase
       # concatenate texts
-      supabase_contexts = supabase_contents.data[0]
+      supabase_contexts = supabase_contents[0]
       for text in supabase_contexts['contexts']:
         supabase_whole_text += text['text']
 
       current_whole_text = ""
       for text in texts:
         current_whole_text += text['input']
 
+      print("supabase_whole_text: ", supabase_whole_text)
+      print("current_whole_text: ", current_whole_text)
+
       # compare with current texts
       if supabase_whole_text == current_whole_text:  # matches the previous file
         print(f"The file 📄: {filename} is a duplicate!")
@@ -1272,7 +1277,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
         print(f"The file 📄: {filename} seems to be updated! Deleting the older file...")
 
         # call the delete function on older docs
-        for content in supabase_contents.data:
+        for content in supabase_contents:
           print("older s3_path to be deleted: ", content['s3_path'])
           delete_status = self.delete_data(course_name, content['s3_path'], '')
           print("delete_status: ", delete_status)
@@ -1281,7 +1286,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     else:  # filename does not already exist in Supabase, so its a brand new file
       print(f"File 📄: {filename} is NOT a duplicate!")
       return False


if __name__ == '__main__':