finished check_for_duplicates()
star-nox committed Nov 16, 2023
1 parent bcefb36 commit 3bda544
Showing 2 changed files with 30 additions and 31 deletions.
11 changes: 7 additions & 4 deletions ai_ta_backend/aws.py
@@ -2,7 +2,7 @@
 from multiprocessing import Lock, cpu_count
 from multiprocessing.pool import ThreadPool
 from typing import List, Optional
 
+import uuid
 import boto3
 
@@ -38,9 +38,12 @@ def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]:
   s3_paths_lock = Lock()
 
   def upload(myfile):
-    print("filename: ", myfile)
-    exit()
-    s3_file = f"courses/{course_name}/{os.path.basename(myfile)}"
+    # get the last part of the path and append unique ID before it
+    directory, old_filename = os.path.split(myfile)
+    new_filename = str(uuid.uuid4()) + '_' + old_filename
+    new_filepath = os.path.join(directory, new_filename)
+
+    s3_file = f"courses/{course_name}/{os.path.basename(new_filepath)}"
     s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file)
     with s3_paths_lock:
       s3_paths.append(s3_file)
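The aws.py change prepends a random UUID to each uploaded filename, so repeated uploads of the same document land under distinct S3 keys while the original name survives after the prefix; the 36-character uuid4 plus the underscore is exactly the 37-character prefix that check_for_duplicates later strips off again. A minimal sketch of the key scheme under the same courses/<course_name>/ layout (the course name, local path, and make_s3_key helper below are illustrative, not code from the repository):

    # Sketch of the upload-key scheme from aws.py above; the boto3 call is left commented out.
    import os
    import uuid

    def make_s3_key(course_name: str, local_path: str) -> str:
      """Build 'courses/<course>/<uuid4>_<original filename>'."""
      _, old_filename = os.path.split(local_path)
      new_filename = str(uuid.uuid4()) + '_' + old_filename  # 36-char uuid + '_' = 37-char prefix
      return f"courses/{course_name}/{new_filename}"

    key = make_s3_key("demo-course", "/tmp/lecture01.pdf")
    print(key)  # e.g. courses/demo-course/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d_lecture01.pdf
    # boto3.client('s3').upload_file("/tmp/lecture01.pdf", os.getenv('S3_BUCKET_NAME'), key)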
50 changes: 23 additions & 27 deletions ai_ta_backend/vector_database.py
@@ -223,21 +223,20 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
       title = title.replace("_", " ")
       title = title.replace("/", " ")
       title = title.strip()
 
-      if kwargs['kwargs'] == {}:
+      print("KWARGS: ", kwargs)
+      if kwargs == {}:
         url = ''
         base_url = ''
       else:
-        if 'url' in kwargs['kwargs'].keys():
-          url = kwargs['kwargs']['url']
+        if 'url' in kwargs.keys():
+          url = kwargs['url']
         else:
           url = ''
-        if 'base_url' in kwargs['kwargs'].keys():
-          base_url = kwargs['kwargs']['base_url']
+        if 'base_url' in kwargs.keys():
+          base_url = kwargs['base_url']
         else:
           base_url = ''
-
-
       text = [soup.get_text()]
 
       metadata: List[Dict[str, Any]] = [{
@@ -744,7 +743,8 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
     )
     contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)
     input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts]
-
+    print("METADATAS: ", metadatas)
+
     # check for duplicates
     is_duplicate = self.check_for_duplicates(input_texts, metadatas)
     print("is_duplicate: ", is_duplicate)
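The split_and_upload hunk stops right after the duplicate check, so the diff does not show how is_duplicate is consumed. The sketch below is only a guess at the surrounding control flow, a gate that skips the embedding and upsert work when the incoming document is unchanged; maybe_ingest, embed_and_upsert, and the return strings are hypothetical placeholders, not code from this repository:

    # Hypothetical gating pattern around check_for_duplicates (illustration only).
    from typing import Any, Callable, Dict, List

    Texts = List[Dict[str, Any]]
    Metas = List[Dict[str, Any]]

    def maybe_ingest(input_texts: Texts, metadatas: Metas,
                     check_for_duplicates: Callable[[Texts, Metas], bool],
                     embed_and_upsert: Callable[[Texts, Metas], None]) -> str:
      """Skip the expensive embedding step when the document was already ingested."""
      if check_for_duplicates(input_texts, metadatas):
        return "Skipped: duplicate document"
      embed_and_upsert(input_texts, metadatas)  # placeholder for the real OpenAI + vector DB writes
      return "Success"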
@@ -1244,34 +1244,28 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     course_name = metadatas[0]['course_name']
     incoming_s3_path = metadatas[0]['s3_path']
     url = metadatas[0]['url']
-    filename = metadatas[0]['readable_filename']
+    incoming_filename = metadatas[0]['readable_filename'] # incoming filename should be equal to old filename
 
     original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix
     original_s3_path = "courses/" + course_name + "/" + original_filename # the older files will have this path
-
 
     print("--------------------Checking for duplicates------------------------")
-    print("METADATAS: ", metadatas)
-    print("S3_PATH: ", incoming_s3_path)
-    print("filename: ", filename)
-    print("OG S3 PATH: ", original_s3_path)
-    exit()
+    # print("METADATAS: ", metadatas)
+    # print("S3_PATH: ", incoming_s3_path)
+    # print("filename: ", incoming_filename)
 
-
-    if s3_path:
-      filename = original_s3_path
-      supabase_contents = self.supabase_client.table(doc_table).select('contexts', 's3_path').eq('course_name', course_name).like('s3_path', '%' + original_s3_path + '%').execute()
+    if incoming_s3_path:
+      filename = incoming_s3_path
+      supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute()
     elif url:
       filename = url
-      supabase_contents = self.supabase_client.table(doc_table).select('contexts', 's3_path').eq('course_name', course_name).eq('url', url).execute()
+      supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq('course_name', course_name).eq('url', url).order('id', desc=True).execute()
     else:
       filename = None
       supabase_contents = []
 
     supabase_whole_text = ""
-    # printing older s3_paths
-    for content in supabase_contents.data:
-      print(content['s3_path'])
-
+    print("no. of docs previously present: ", len(supabase_contents.data))
+
     if len(supabase_contents.data) > 0: # if a doc with same filename exists in Supabase
       # concatenate texts
@@ -1289,13 +1283,15 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
         print(f"The file πŸ“„: {filename} is a duplicate!")
         return True
       else: # the file is updated
-        print(f"The file πŸ“„: {filename} seems to be updated! Deleting the older data...")
-        # call the delete function
+        print(f"The file πŸ“„: {filename} seems to be updated! Deleting the older file...")
+
+        # call the delete function on older docs - ideally should only be 1
         for content in supabase_contents.data:
-          print("content: ", content['s3_path'])
+          print("older s3_path to be deleted: ", content['s3_path'])
           delete_status = self.delete_data(course_name, content['s3_path'], '')
+          print("delete_status: ", delete_status)
         return False
 
     else: # filename does not already exist in Supabase, so its a brand new file
       print(f"File πŸ“„: {filename} is NOT a duplicate!")
       return False
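Taken together, the vector_database.py hunks implement the duplicate check this commit finishes: strip the 37-character uuid prefix from the incoming S3 path to recover the original filename, fetch any previously ingested rows for that filename (or URL) from Supabase, and compare the stored context texts against the incoming chunks; a match means the file is a duplicate, a mismatch means it was updated and the older rows are deleted via delete_data. The hunk cuts off before the actual comparison, but the variable names suggest both sides are concatenated and compared as whole strings. A self-contained sketch of that idea, with the Supabase query replaced by an in-memory list so it runs standalone; the 'text' key inside each stored context is an assumption about the row layout, not something visible in this diff:

    # Standalone sketch of the duplicate check; Supabase is mocked with a plain list.
    from typing import Any, Dict, List

    def strip_uuid_prefix(s3_path: str) -> str:
      """Recover the original filename from 'courses/<course>/<uuid4>_<name>'."""
      return s3_path.split('/')[-1][37:]  # 36-char uuid4 + '_' = 37 chars

    def is_duplicate(incoming_texts: List[Dict[str, Any]], prior_rows: List[Dict[str, Any]]) -> bool:
      """Compare concatenated incoming chunks against previously stored contexts."""
      if not prior_rows:
        return False  # brand-new file
      stored_whole_text = "".join(ctx['text'] for row in prior_rows for ctx in row['contexts'])
      incoming_whole_text = "".join(item['input'] for item in incoming_texts)
      return stored_whole_text == incoming_whole_text

    # Example: same original filename, changed content -> not a duplicate, so the
    # older rows would be handed to delete_data() and the new version re-ingested.
    prior = [{'s3_path': 'courses/demo/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d_notes.pdf',
              'contexts': [{'text': 'old chunk'}]}]
    incoming = [{'input': 'new chunk', 'model': 'text-embedding-ada-002'}]
    print(strip_uuid_prefix(prior[0]['s3_path']))  # notes.pdf
    print(is_duplicate(incoming, prior))           # False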
