Skip to content

Commit

Permalink
HOTFIX: critical bug with duplicate documents in Supabase for EVERY p…
Browse files Browse the repository at this point in the history
…age in doc, fix GitHub ingest to standardize
  • Loading branch information
KastanDay committed Sep 15, 2023
1 parent 241f819 commit 4f6a863
Showing 1 changed file with 28 additions and 18 deletions.
46 changes: 28 additions & 18 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,16 +681,17 @@ def ingest_github(self, github_url: str, course_name: str) -> str:
data = loader.load()
shutil.rmtree("media/cloned_repo")
# create metadata for each file in data
texts = [doc.page_content for doc in data]
metadatas: List[Dict[str, Any]] = [{
'course_name': course_name,
's3_path': '',
'readable_filename': doc.metadata['file_name'],
'url': github_url,
'pagenumber': '',
'timestamp': '',
} for doc in data]
self.split_and_upload(texts=texts, metadatas=metadatas)
for doc in data:
texts = doc.page_content
metadatas: Dict[str, Any] = {
'course_name': course_name,
's3_path': '',
'readable_filename': doc.metadata['file_name'],
'url': github_url,
'pagenumber': '',
'timestamp': '',
}
self.split_and_upload(texts=[texts], metadatas=[metadatas])
return "Success"
except Exception as e:
print(f"ERROR IN GITHUB INGEST {e}")
Expand Down Expand Up @@ -756,14 +757,23 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
"embedding": embeddings_dict[context.page_content]
} for context in contexts]

document = [{
"course_name": context.metadata.get('course_name'),
"s3_path": context.metadata.get('s3_path'),
"readable_filename": context.metadata.get('readable_filename'),
"url": context.metadata.get('url'),
"base_url": context.metadata.get('base_url'),
"contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that
} for context in contexts]
document = {
"course_name": contexts[0].metadata.get('course_name'),
"s3_path": contexts[0].metadata.get('s3_path'),
"readable_filename": contexts[0].metadata.get('readable_filename'),
"url": contexts[0].metadata.get('url'),
"base_url": contexts[0].metadata.get('base_url'),
"contexts": contexts_for_supa,
}

# document = [{
# "course_name": context.metadata.get('course_name'),
# "s3_path": context.metadata.get('s3_path'),
# "readable_filename": context.metadata.get('readable_filename'),
# "url": context.metadata.get('url'),
# "base_url": context.metadata.get('base_url'),
# "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that
# } for context in contexts]

count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore
print("successful END OF split_and_upload")
Expand Down

0 comments on commit 4f6a863

Please sign in to comment.