Commit

fixed filename errors

star-nox committed Nov 16, 2023
1 parent 3bda544 commit b63ca84
Showing 2 changed files with 18 additions and 21 deletions.
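The two files change in lockstep: web_scrape.py now prepends `str(uuid.uuid4()) + '_'` to each uploaded filename, apparently so repeated uploads of the same name cannot collide, and every ingest path in vector_database.py strips that prefix again with a `[37:]` slice when deriving `readable_filename`. The slice length works because `str(uuid.uuid4())` is always 36 characters (32 hex digits in 8-4-4-4-12 groups) plus one underscore. A minimal sketch of the round trip; the filename and course prefix below are hypothetical, not from the commit:

```python
import uuid
from pathlib import Path

original = "lecture_01.pdf"                    # hypothetical filename
prefixed = str(uuid.uuid4()) + '_' + original  # as added in web_scrape.py
assert len(str(uuid.uuid4())) == 36            # uuid4 text form is fixed-width
s3_path = "courses/demo-course/" + prefixed    # hypothetical S3 layout
assert Path(s3_path).name[37:] == original     # what the [37:] slices assume
```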
ai_ta_backend/vector_database.py (15 additions, 21 deletions)
@@ -169,7 +169,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -200,7 +200,7 @@ def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -223,6 +223,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
     title = title.replace("_", " ")
     title = title.replace("/", " ")
     title = title.strip()
+    title = title[37:]  # remove unique ID from the filename
     print("KWARGS: ", kwargs)
     if kwargs == {}:
       url = ''
@@ -318,7 +319,7 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': text.index(txt),
         'url': '',
@@ -344,7 +345,7 @@ def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -370,7 +371,7 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -397,7 +398,7 @@ def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -431,7 +432,7 @@ def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -457,7 +458,7 @@ def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -507,7 +508,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):

         # Extract text
         text = page.get_text().encode("utf8").decode("utf8", errors='ignore')  # get plain text (is in UTF-8)
-        pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))
+        pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:]))

       # Webscrape kwargs
       if 'kwargs' in kwargs.keys() and kwargs['kwargs'] == {}:
@@ -568,7 +569,7 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -599,7 +600,7 @@ def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
     metadatas: List[Dict[str, Any]] = [{
         'course_name': course_name,
         's3_path': s3_path,
-        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name,
+        'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
         'pagenumber': '',
         'timestamp': '',
         'url': '',
@@ -1244,15 +1245,8 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     course_name = metadatas[0]['course_name']
     incoming_s3_path = metadatas[0]['s3_path']
     url = metadatas[0]['url']
+    incoming_filename = metadatas[0]['readable_filename'] # incoming filename should be equal to old filename

-    original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix
-
     print("--------------------Checking for duplicates------------------------")
-    # print("METADATAS: ", metadatas)
-    # print("S3_PATH: ", incoming_s3_path)
-    # print("filename: ", incoming_filename)
-
-    print("original_filename: ", original_filename)
-
     if incoming_s3_path:
       filename = incoming_s3_path
@@ -1270,7 +1264,6 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     if len(supabase_contents.data) > 0: # if a doc with same filename exists in Supabase
       # concatenate texts
       supabase_contexts = supabase_contents.data[0]
-
       for text in supabase_contexts['contexts']:
         supabase_whole_text += text['text']

@@ -1282,10 +1275,11 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
       if supabase_whole_text == current_whole_text: # matches the previous file
         print(f"The file 📄: {filename} is a duplicate!")
         return True
+
       else: # the file is updated
         print(f"The file 📄: {filename} seems to be updated! Deleting the older file...")

-        # call the delete function on older docs - ideally should only be 1
+        # call the delete function on older docs
        for content in supabase_contents.data:
           print("older s3_path to be deleted: ", content['s3_path'])
           delete_status = self.delete_data(course_name, content['s3_path'], '')
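A side note on the pattern this diff touches ten times: the `kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else ...` conditional is repeated verbatim in every ingest method. `dict.get` with a default is the idiomatic equivalent, and a named constant would document the magic number 37. A sketch of a possible follow-up refactor; this is not something the commit does, and the names are hypothetical:

```python
from pathlib import Path
from typing import Any, Dict

UUID_PREFIX_LEN = 36 + 1  # len(str(uuid.uuid4())) plus the '_' separator

def readable_filename(s3_path: str, kwargs: Dict[str, Any]) -> str:
    # Same behavior as the repeated conditional: an explicit kwarg wins,
    # otherwise the uuid prefix is sliced off the S3 object's filename.
    return kwargs.get('readable_filename', Path(s3_path).name[UUID_PREFIX_LEN:])
```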
ai_ta_backend/web_scrape.py (3 additions, 0 deletions)
@@ -3,6 +3,7 @@
 import re
 import shutil
 import time
+import uuid
 from collections import Counter
 from tempfile import NamedTemporaryFile
 from zipfile import ZipFile
@@ -199,6 +200,8 @@ def ingest_file(self, key, course_name, path_name, base_url):
         print("Writing", key[2], "to temp file")
         temp_file.write(key[1])
         temp_file.seek(0)
+        path_name = str(uuid.uuid4()) + '_' + path_name
+        print("path name in webscrape: ", path_name)
         s3_upload_path = "courses/" + course_name + "/" + path_name + key[2]
         with open(temp_file.name, 'rb') as f:
           print("Uploading", key[2], "to S3")
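Worth noting for maintenance: after this commit the two files are coupled only by the 37-character convention, since the prefix attached here is exactly what the `[37:]` slices remove downstream. If the separator or the UUID text format ever changed, every fixed-width slice would silently corrupt filenames. A pair of shared helpers would keep the convention in one place; a hedged sketch with hypothetical names, not part of the commit:

```python
import uuid
from pathlib import Path

SEP = '_'

def add_unique_prefix(filename: str) -> str:
    """Prefix a filename with a uuid4 so repeated uploads cannot collide."""
    return f"{uuid.uuid4()}{SEP}{filename}"

def strip_unique_prefix(s3_path: str) -> str:
    """Recover the human-readable filename from a uuid-prefixed S3 path."""
    name = Path(s3_path).name
    return name.split(SEP, 1)[1] if SEP in name else name
```

Splitting on the first separator (a uuid4 string contains no underscores) would also survive a future switch to shorter IDs, which a fixed 37-character slice would not.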
