Skip to content

Commit

Permalink
Cleaning up prints; testing looks good. Ready to merge.
Browse files Browse the repository at this point in the history
  • Loading branch information
KastanDay committed Dec 12, 2023
1 parent a1b4127 commit e01ee11
Showing 1 changed file with 7 additions and 16 deletions.
23 changes: 7 additions & 16 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,12 +730,8 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):

# check for duplicates
is_duplicate = self.check_for_duplicates(input_texts, metadatas)
print("is_duplicate: ", is_duplicate)
if is_duplicate:
print("split_and_upload returning duplicate")
return "🚫🚫 Duplicate, ingest skipped.🚫🚫"

print("split_and_upload continuing...")
return "Success"

# adding chunk index to metadata for parent doc retrieval
for i, context in enumerate(contexts):
Expand Down Expand Up @@ -1221,20 +1217,18 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
For given metadata, fetch docs from Supabase based on S3 path or URL.
If docs exists, concatenate the texts and compare with current texts, if same, return True.
"""
print("in check_for_duplicates")

doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')
doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
course_name = metadatas[0]['course_name']
incoming_s3_path = metadatas[0]['s3_path']
url = metadatas[0]['url']
original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix
print("Extracted filename from incoming s3_path: ", original_filename)

# check if uuid exists in s3_path
# check if uuid exists in s3_path -- not all s3_paths have uuids!
incoming_filename = incoming_s3_path.split('/')[-1]
pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I) # uuid V4 pattern, and v4 only.
if bool(pattern.search(incoming_filename)): # uuid pattern exists
# remove the uuid and proceed with duplicate checking
if bool(pattern.search(incoming_filename)):
# uuid pattern exists -- remove the uuid and proceed with duplicate checking
original_filename = incoming_filename[37:]
else:
# do not remove anything and proceed with duplicate checking
Expand All @@ -1253,8 +1247,6 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
supabase_contents = []

supabase_whole_text = ""
print("no. of docs previously present: ", len(supabase_contents))

if len(supabase_contents) > 0: # if a doc with same filename exists in Supabase
# concatenate texts
supabase_contexts = supabase_contents[0]
Expand All @@ -1265,8 +1257,8 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
for text in texts:
current_whole_text += text['input']

print("supabase_whole_text: ", supabase_whole_text)
print("current_whole_text: ", current_whole_text)
# print("supabase_whole_text: ", supabase_whole_text)
# print("current_whole_text: ", current_whole_text)

# compare with current texts
if supabase_whole_text == current_whole_text: # matches the previous file
Expand All @@ -1288,6 +1280,5 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
return False



# Intentional no-op entry point: this module is imported as a library and
# has no CLI behavior (presumably kept so the file can be run without error).
if __name__ == '__main__':
  pass

0 comments on commit e01ee11

Please sign in to comment.