Fix bug when no duplicate exists.
KastanDay committed Dec 12, 2023
1 parent 2a6f4b2 commit a1b4127
Showing 1 changed file with 12 additions and 7 deletions.
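In outline, the change unwraps the Supabase query response into a plain list (supabase_contents = supabase_contents.data) right after each query, adds an else branch that sets filename = None and supabase_contents = [] when neither an s3 path nor a URL is available, and switches the later length check, indexing, and deletion loop to use that list directly instead of reaching into .data. A minimal sketch of the resulting pattern, with hypothetical names, follows the diff.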
19 changes: 12 additions & 7 deletions ai_ta_backend/vector_database.py
@@ -1102,7 +1102,7 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n
       summary = f"\nSummary: {text}"
       all_texts += doc + summary + '\n' + separator + '\n'
 
-    stuffed_prompt = f"""Please answer the following question.
+    stuffed_prompt = """Please answer the following question.
 Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant.
 It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know.
 Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable.
@@ -1232,7 +1232,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
 
       # check if uuid exists in s3_path
       incoming_filename = incoming_s3_path.split('/')[-1]
-      pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
+      pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I) # uuid V4 pattern, and v4 only.
       if bool(pattern.search(incoming_filename)): # uuid pattern exists
         # remove the uuid and proceed with duplicate checking
         original_filename = incoming_filename[37:]
@@ -1243,26 +1243,31 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     if incoming_s3_path:
       filename = incoming_s3_path
       supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
+      supabase_contents = supabase_contents.data
     elif url:
       filename = url
       supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).eq('url', url).order('id', desc=True).execute()
+      supabase_contents = supabase_contents.data
+    else:
+      filename = None
+      supabase_contents = []
 
     supabase_whole_text = ""
-    print("no. of docs previously present: ", len(supabase_contents.data))
+    print("no. of docs previously present: ", len(supabase_contents))
 
-    if len(supabase_contents.data) > 0: # if a doc with same filename exists in Supabase
+    if len(supabase_contents) > 0: # if a doc with same filename exists in Supabase
       # concatenate texts
-      supabase_contexts = supabase_contents.data[0]
+      supabase_contexts = supabase_contents[0]
       for text in supabase_contexts['contexts']:
         supabase_whole_text += text['text']
 
       current_whole_text = ""
       for text in texts:
         current_whole_text += text['input']
 
       print("supabase_whole_text: ", supabase_whole_text)
       print("current_whole_text: ", current_whole_text)
 
       # compare with current texts
       if supabase_whole_text == current_whole_text: # matches the previous file
         print(f"The file 📄: {filename} is a duplicate!")
@@ -1272,7 +1277,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
         print(f"The file 📄: {filename} seems to be updated! Deleting the older file...")
 
         # call the delete function on older docs
-        for content in supabase_contents.data:
+        for content in supabase_contents:
           print("older s3_path to be deleted: ", content['s3_path'])
           delete_status = self.delete_data(course_name, content['s3_path'], '')
           print("delete_status: ", delete_status)
@@ -1281,7 +1286,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     else: # filename does not already exist in Supabase, so its a brand new file
       print(f"File 📄: {filename} is NOT a duplicate!")
       return False
 
 
 
 if __name__ == '__main__':
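The following is a minimal sketch, not the project's actual code, of the pattern this commit appears to adopt: unwrap the response's .data once, fall back to an empty list when no lookup was made, and let every later step (length check, indexing, iteration) assume a plain list. The function names find_existing_docs and is_duplicate and the response argument are hypothetical; only the 'contexts'/'text'/'input' keys and the .data attribute come from the diff itself.

from typing import Any, Dict, List, Optional

def find_existing_docs(response: Optional[Any]) -> List[Dict[str, Any]]:
  """Return stored rows as a plain list, whatever the lookup returned.

  `response` stands in for the object a Supabase `.execute()` call returns,
  which carries its rows on `.data`; None stands in for the case where no
  lookup was made (no s3 path and no URL), mirroring the new `else` branch.
  """
  if response is None:
    return []  # nothing on record, so nothing to compare against
  return response.data  # unwrap once; callers never touch `.data` again

def is_duplicate(existing: List[Dict[str, Any]], texts: List[Dict[str, str]]) -> bool:
  """Compare the stored contexts against the incoming texts, as the diff does."""
  if not existing:  # brand-new file: the path the old code mishandled
    return False
  stored = "".join(chunk['text'] for chunk in existing[0]['contexts'])
  incoming = "".join(chunk['input'] for chunk in texts)
  return stored == incoming

Unwrapping .data in one place keeps the comparison and deletion logic indifferent to whether a query was ever run, which appears to be what lets the "no duplicate exists" case fall through to return False instead of erroring.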