Merge branch 'main' into file-update
star-nox authored Nov 16, 2023
2 parents b63ca84 + 0d371ba commit 273d598
Showing 1 changed file with 54 additions and 72 deletions.
126 changes: 54 additions & 72 deletions ai_ta_backend/vector_database.py
@@ -169,7 +169,7 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -183,7 +183,9 @@ def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs):
return success_or_failure

except Exception as e:
print(f"ERROR IN py READING {e}")
err = f"❌❌ Error in (Python ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
"""
@@ -200,7 +202,7 @@ def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
@@ -210,9 +212,12 @@ def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs):
success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas)
return success_or_failure
except Exception as e:
print(f"ERROR IN VTT READING {e}")
err = f"❌❌ Error in (VTT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
print(f"IN _ingest_html s3_path `{s3_path}` kwargs: {kwargs}")
try:
response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path)
raw_html = response['Body'].read().decode('utf-8')
@@ -223,29 +228,15 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
title = title.replace("_", " ")
title = title.replace("/", " ")
title = title.strip()
title = title[37:] # remove unique ID from the filename
print("KWARGS: ", kwargs)
if kwargs == {}:
url = ''
base_url = ''
else:
if 'url' in kwargs.keys():
url = kwargs['url']
else:
url = ''
if 'base_url' in kwargs.keys():
base_url = kwargs['base_url']
else:
base_url = ''

title = title[37:] # removing the uuid prefix
text = [soup.get_text()]

metadata: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': str(title), # adding str to avoid error: unhashable type 'slice'
'url': url,
'base_url': base_url,
'url': kwargs.get('url', ''),
'base_url': kwargs.get('base_url', ''),
'pagenumber': '',
'timestamp': '',
}]
@@ -256,7 +247,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
except Exception as e:
err: str = f"ERROR IN _ingest_html: {e}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore
print(err)
return f"_ingest_html Error: {e}"
return err

def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
"""
@@ -319,7 +310,7 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': text.index(txt),
'url': '',
@@ -329,9 +320,9 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=text, metadatas=metadatas)
return "Success"
except Exception as e:
print("ERROR IN VIDEO READING ")
print(e)
return f"Error {e}"
err = f"❌❌ Error in (VIDEO ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
@@ -345,7 +336,7 @@ def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -355,8 +346,9 @@ def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (DOCX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"❌❌ Error in (DOCX ingest): {e}"
err = f"❌❌ Error in (DOCX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
@@ -371,7 +363,7 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -381,8 +373,9 @@ def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (SRT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"❌❌ Error in (SRT ingest): {e}"
err = f"❌❌ Error in (SRT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
@@ -398,7 +391,7 @@ def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -408,8 +401,9 @@ def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (Excel/xlsx ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"Error: {e}"
err = f"❌❌ Error in (Excel/xlsx ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
@@ -432,7 +426,7 @@ def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -442,8 +436,9 @@ def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (png/jpg ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"Error: {e}"
err = f"❌❌ Error in (png/jpg ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
try:
@@ -458,7 +453,7 @@ def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -468,8 +463,9 @@ def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (CSV ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"❌❌ Error in (CSV ingest): {e}"
err = f"❌❌ Error in (CSV ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
"""
@@ -510,33 +506,15 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
text = page.get_text().encode("utf8").decode("utf8", errors='ignore') # get plain text (is in UTF-8)
pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:]))

# Webscrape kwargs
if 'kwargs' in kwargs.keys() and kwargs['kwargs'] == {}:
url = ''
base_url = ''
elif 'kwargs' not in kwargs.keys():
url = ''
base_url = ''
else:
if 'url' in kwargs['kwargs'].keys():
url = kwargs['kwargs']['url']
else:
url = ''
if 'base_url' in kwargs['kwargs'].keys():
base_url = kwargs['kwargs']['base_url']
else:
base_url = ''


metadatas: List[Dict[str, Any]] = [
{
'course_name': course_name,
's3_path': s3_path,
'pagenumber': page['page_number'] + 1, # +1 for human indexing
'timestamp': '',
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else page['readable_filename'],
'url': url,
'base_url': base_url,
'readable_filename': kwargs.get('readable_filename', page['readable_filename']),
'url': kwargs.get('url', ''),
'base_url': kwargs.get('base_url', ''),
} for page in pdf_pages_OCRed
]
pdf_texts = [page['text'] for page in pdf_pages_OCRed]
@@ -545,8 +523,9 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
print("PDF message: ", success_or_failure)
return success_or_failure
except Exception as e:
print(f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"❌❌ Error in (PDF ingest): {e}"
err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err
return "Success"

def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
@@ -569,7 +548,7 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -580,8 +559,9 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str:
success_or_failure = self.split_and_upload(texts=text, metadatas=metadatas)
return success_or_failure
except Exception as e:
print(f"❌❌ Error in (TXT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"Error: {e}"
err = f"❌❌ Error in (TXT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
"""
@@ -600,7 +580,7 @@ def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': s3_path,
'readable_filename': kwargs['readable_filename'] if 'readable_filename' in kwargs.keys() else Path(s3_path).name[37:],
'readable_filename': kwargs.get('readable_filename', Path(s3_path).name[37:]),
'pagenumber': '',
'timestamp': '',
'url': '',
Expand All @@ -610,8 +590,9 @@ def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str:
self.split_and_upload(texts=texts, metadatas=metadatas)
return "Success"
except Exception as e:
print(f"❌❌ Error in (PPTX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"Error: {e}"
err = f"❌❌ Error in (PPTX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
print(err)
return err

def list_files_recursively(self, bucket, prefix):
all_files = []
@@ -718,8 +699,9 @@ def ingest_github(self, github_url: str, course_name: str) -> str:
self.split_and_upload(texts=[texts], metadatas=[metadatas])
return "Success"
except Exception as e:
print(f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.print_exc())
return f"❌❌ Error in (GITHUB ingest): {e}"
err = f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"
print(err)
return err

def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
""" This is usually the last step of document ingest. Chunk & upload to Qdrant (and Supabase.. todo).

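For reference, below is a minimal standalone sketch of the two patterns this commit applies across the ingest methods: replacing explicit "'key' in kwargs.keys()" checks with dict.get (defaulting to the S3 filename with its 37-character UUID prefix stripped), and building the error message as a single formatted string in the style of the GITHUB ingest hunk. The helper names and the sample path are illustrative only, not part of the repository.

import inspect
import traceback
from pathlib import Path


def readable_filename_from(s3_path: str, **kwargs) -> str:
    # Same shape as the diff: prefer an explicit 'readable_filename' kwarg,
    # otherwise strip the 36-character UUID prefix (plus separator) that
    # uploaded S3 object names carry in this codebase.
    return kwargs.get('readable_filename', Path(s3_path).name[37:])


def demo_ingest(s3_path: str, **kwargs) -> str:
    # Illustrative error handling in the same shape as the ingest methods.
    try:
        filename = readable_filename_from(s3_path, **kwargs)
        return f"Success: {filename}"
    except Exception as e:
        # Embedding the traceback in the f-string keeps err a single str.
        err = f"❌❌ Error in `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}"
        print(err)
        return err


print(demo_ingest("courses/0b1c2d3e-4f5a-6789-abcd-ef0123456789-lecture_01.pdf"))
# -> Success: lecture_01.pdf
print(demo_ingest("courses/0b1c2d3e-4f5a-6789-abcd-ef0123456789-lecture_01.pdf",
                  readable_filename="Lecture 1 slides"))
# -> Success: Lecture 1 slides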