Skip to content

Commit

Permalink
Added a UUID check in check_for_duplicates()
Browse files: browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 20, 2023
1 parent a1e0f4b commit 290c616
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion ai_ta_backend/vector_database.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1228,7 +1228,17 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
incoming_s3_path = metadatas[0]['s3_path']
url = metadatas[0]['url']
original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix
print("original_filename: ", original_filename)
print("Extracted filename from incoming s3_path: ", original_filename)

# check if uuid exists in s3_path
incoming_filename = incoming_s3_path.split('/')[-1]
pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)
if bool(pattern.search(incoming_filename)): # uuid pattern exists
# remove the uuid and proceed with duplicate checking
original_filename = incoming_filename[37:]
else:
# do not remove anything and proceed with duplicate checking
original_filename = incoming_filename

if incoming_s3_path:
filename = incoming_s3_path
Expand Down

0 comments on commit 290c616

Please sign in to comment.