diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 08aa37ed..66622314 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -681,16 +681,17 @@ def ingest_github(self, github_url: str, course_name: str) -> str: data = loader.load() shutil.rmtree("media/cloned_repo") # create metadata for each file in data - texts = [doc.page_content for doc in data] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': '', - 'readable_filename': doc.metadata['file_name'], - 'url': github_url, - 'pagenumber': '', - 'timestamp': '', - } for doc in data] - self.split_and_upload(texts=texts, metadatas=metadatas) + for doc in data: + texts = doc.page_content + metadatas: Dict[str, Any] = { + 'course_name': course_name, + 's3_path': '', + 'readable_filename': doc.metadata['file_name'], + 'url': github_url, + 'pagenumber': '', + 'timestamp': '', + } + self.split_and_upload(texts=[texts], metadatas=[metadatas]) return "Success" except Exception as e: print(f"ERROR IN GITHUB INGEST {e}") @@ -756,14 +757,23 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): "embedding": embeddings_dict[context.page_content] } for context in contexts] - document = [{ - "course_name": context.metadata.get('course_name'), - "s3_path": context.metadata.get('s3_path'), - "readable_filename": context.metadata.get('readable_filename'), - "url": context.metadata.get('url'), - "base_url": context.metadata.get('base_url'), - "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that - } for context in contexts] + document = { + "course_name": contexts[0].metadata.get('course_name'), + "s3_path": contexts[0].metadata.get('s3_path'), + "readable_filename": contexts[0].metadata.get('readable_filename'), + "url": contexts[0].metadata.get('url'), + "base_url": contexts[0].metadata.get('base_url'), + "contexts": contexts_for_supa, + } + + # document = [{ + # "course_name": context.metadata.get('course_name'), + # "s3_path": context.metadata.get('s3_path'), + # "readable_filename": context.metadata.get('readable_filename'), + # "url": context.metadata.get('url'), + # "base_url": context.metadata.get('base_url'), + # "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that + # } for context in contexts] count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore print("successful END OF split_and_upload")