From 016e48bdb06e3328e4b4db456706b30c74d96ada Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Thu, 7 Mar 2024 16:19:57 -0800
Subject: [PATCH] Clean up env vars and minor type errors

---
 .env.template                              |  2 +-
 ai_ta_backend/beam/ingest.py               | 21 +++++++++----------
 ai_ta_backend/database/sql.py              | 21 +++++++++----------
 ai_ta_backend/service/export_service.py    | 13 +++++++-----
 ai_ta_backend/service/nomic_service.py     |  2 +-
 ai_ta_backend/service/retrieval_service.py |  8 +++----
 .../utils/context_parent_doc_padding.py    |  2 +-
 ai_ta_backend/utils/utils_tokenization.py  |  1 +
 8 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/.env.template b/.env.template
index ba04c704..b007d62b 100644
--- a/.env.template
+++ b/.env.template
@@ -5,7 +5,7 @@
 SUPABASE_READ_ONLY=
 SUPABASE_JWT_SECRET=
 MATERIALS_SUPABASE_TABLE=uiuc_chatbot
-NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents
+SUPABASE_DOCUMENTS_TABLE=documents
 
 # QDRANT
 QDRANT_COLLECTION_NAME=uiuc-chatbot
diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py
index 0aaf7d58..77f74de6 100644
--- a/ai_ta_backend/beam/ingest.py
+++ b/ai_ta_backend/beam/ingest.py
@@ -959,7 +959,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
       }
 
       response = self.supabase_client.table(
-          os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute()  # type: ignore
+          os.getenv('SUPABASE_DOCUMENTS_TABLE')).insert(document).execute()  # type: ignore
 
       # add to Nomic document map
       if len(response.data) > 0:
@@ -988,7 +988,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]
     For given metadata, fetch docs from Supabase based on S3 path or URL.
     If docs exists, concatenate the texts and compare with current texts, if same, return True.
     """
-    doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
+    doc_table = os.getenv('SUPABASE_DOCUMENTS_TABLE', '')
     course_name = metadatas[0]['course_name']
     incoming_s3_path = metadatas[0]['s3_path']
     url = metadatas[0]['url']
@@ -1087,8 +1087,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
     try:
       # delete from Nomic
       response = self.supabase_client.from_(
-          os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(
-              's3_path', s3_path).eq('course_name', course_name).execute()
+          os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq('s3_path', s3_path).eq(
+              'course_name', course_name).execute()
       data = response.data[0]  #single record fetched
       nomic_ids_to_delete = []
       context_count = len(data['contexts'])
@@ -1102,8 +1102,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
       sentry_sdk.capture_exception(e)
 
     try:
-      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-          's3_path', s3_path).eq('course_name', course_name).execute()
+      self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq(
+          'course_name', course_name).execute()
     except Exception as e:
       print("Error in deleting file from supabase:", e)
       sentry_sdk.capture_exception(e)
@@ -1131,9 +1131,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
       sentry_sdk.capture_exception(e)
     try:
       # delete from Nomic
-      response = self.supabase_client.from_(
-          os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq(
-              'url', source_url).eq('course_name', course_name).execute()
+      response = self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, url, contexts").eq(
+          'url', source_url).eq('course_name', course_name).execute()
       data = response.data[0]  #single record fetched
       nomic_ids_to_delete = []
       context_count = len(data['contexts'])
@@ -1148,8 +1147,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
 
     try:
       # delete from Supabase
-      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-          'url', source_url).eq('course_name', course_name).execute()
+      self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('url', source_url).eq(
+          'course_name', course_name).execute()
     except Exception as e:
       print("Error in deleting file from supabase:", e)
       sentry_sdk.capture_exception(e)
diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py
index bf750aa3..b8d4579c 100644
--- a/ai_ta_backend/database/sql.py
+++ b/ai_ta_backend/database/sql.py
@@ -13,26 +13,25 @@ def __init__(self, db_url: str):
         supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
 
   def getAllMaterialsForCourse(self, course_name: str):
-    return self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select(
-        'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute()
+    return self.supabase_client.table(
+        os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
+            'course_name', course_name).execute()
 
   def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
-    return self.supabase_client.from_(
-        os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(
-            's3_path', s3_path).eq('course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
+        's3_path', s3_path).eq('course_name', course_name).execute()
 
   def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
-    return self.supabase_client.from_(
-        os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(key, value).eq(
-            'course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
+        key, value).eq('course_name', course_name).execute()
 
   def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
-    return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(key, value).eq(
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq(key, value).eq(
         'course_name', course_name).execute()
 
   def deleteMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
-    return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-        's3_path', s3_path).eq('course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq(
+        'course_name', course_name).execute()
 
   def getProjectsMapForCourse(self, course_name: str):
     return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute()
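Reviewer note, not part of the applied diff: any deployment that still sets only NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE will hit a KeyError on the os.environ lookups above once this merges. If a rollout window is needed, a small shim could bridge the rename; the helper below is a hypothetical sketch (name and fallback policy are illustrative), not code in this PR:

    import os

    def documents_table() -> str:
        # Prefer the new variable; fall back to the legacy name during rollout.
        table = os.environ.get('SUPABASE_DOCUMENTS_TABLE') or os.environ.get('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')
        if not table:
            raise KeyError('SUPABASE_DOCUMENTS_TABLE is not set')
        return table

Call sites would then use documents_table() in place of os.environ['SUPABASE_DOCUMENTS_TABLE'] until the old name is retired.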
diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index 61469ec4..ad095d79 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -148,7 +148,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):
     #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
     s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
 
-    self.s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file)
+    self.s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
 
     # remove local files
     os.remove(file_path)
@@ -157,12 +157,15 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):
     print("file uploaded to s3: ", s3_file)
 
     # generate presigned URL
-    s3_url = self.s3.generatePresignedUrl('get_object', os.getenv('S3_BUCKET_NAME'), s3_path, 3600)
+    s3_url = self.s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
 
     # get admin email IDs
-    headers = {"Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", "Content-Type": "application/json"}
+    headers = {
+        "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
+        "Content-Type": "application/json"
+    }
 
-    hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name
+    hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
     response = requests.get(hget_url, headers=headers)
     course_metadata = response.json()
     course_metadata = json.loads(course_metadata['result'])
@@ -187,7 +190,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails)
+    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
     print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py
index 5b6c4e38..45a4074f 100644
--- a/ai_ta_backend/service/nomic_service.py
+++ b/ai_ta_backend/service/nomic_service.py
@@ -71,7 +71,7 @@ def __init__(self, sentry: SentryService, sql: SQLDatabase):
                         raise_on_giveup=False,
                         giveup=giveup_hdlr,
                         on_backoff=backoff_hdlr)
-  def log_convo_to_nomic(self, course_name: str, conversation) -> str:
+  def log_convo_to_nomic(self, course_name: str, conversation) -> str | None:
     # nomic.login(os.getenv('NOMIC_API_KEY'))  # login during start of flask app
     NOMIC_MAP_NAME_PREFIX = 'Conversation Map for '
     """
diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py
index 4f12ea8b..92174ba4 100644
--- a/ai_ta_backend/service/retrieval_service.py
+++ b/ai_ta_backend/service/retrieval_service.py
@@ -75,8 +75,8 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =
     pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
 
     # count tokens at start and end, then also count each context.
-    token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +
-                                             search_query)  # type: ignore
+    token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +  # type: ignore
+                                             search_query)
 
     valid_docs = []
     num_tokens = 0
@@ -357,8 +357,8 @@ def vector_search(self, search_query, course_name):
     for d in search_results:
       try:
         metadata = d.payload
-        page_content = metadata["page_content"]
-        del metadata["page_content"]
+        page_content = metadata["page_content"]  # type: ignore
+        del metadata["page_content"]  # type: ignore
         if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys():  # type: ignore
           # aiding in the database migration...
           metadata["pagenumber"] = metadata["pagenumber_or_timestamp"]  # type: ignore
diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py
index 8521d99e..e015b018 100644
--- a/ai_ta_backend/utils/context_parent_doc_padding.py
+++ b/ai_ta_backend/utils/context_parent_doc_padding.py
@@ -6,7 +6,7 @@
 
 import supabase
 
-DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
+DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE']
 # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'],
 #                                          supabase_key=os.environ['SUPABASE_API_KEY'])  # type: ignore
 
diff --git a/ai_ta_backend/utils/utils_tokenization.py b/ai_ta_backend/utils/utils_tokenization.py
index 7070ea7f..956cc196 100644
--- a/ai_ta_backend/utils/utils_tokenization.py
+++ b/ai_ta_backend/utils/utils_tokenization.py
@@ -9,6 +9,7 @@ def count_tokens_and_cost(
     completion: str = '',
     openai_model_name: str = "gpt-3.5-turbo"):  # -> tuple[int, float] | tuple[int, float, int, float]:
   """
+  # TODO: improve w/ extra tokens used by model: https://github.com/openai/openai-cookbook/blob/d00e9a48a63739f5b038797594c81c8bb494fc09/examples/How_to_count_tokens_with_tiktoken.ipynb
   Returns the number of tokens in a text string.
   Only the first parameter is required, a string of text to measure.
   The completion and model name are optional.
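Reviewer note, not part of the applied diff: the os.getenv → os.environ[...] swaps in export_service.py are what resolve most of the "minor type errors" named in the subject line. os.getenv returns Optional[str] (None when the variable is unset), so passing its result to APIs that expect str fails static type checking, while os.environ['KEY'] is typed str and raises KeyError immediately when the variable is missing. A minimal sketch of the difference, with an illustrative variable name:

    import os

    # os.getenv returns Optional[str]; type checkers flag str-only call sites.
    maybe_bucket: str | None = os.getenv('S3_BUCKET_NAME')

    # os.environ[...] is typed str and fails fast with KeyError when unset,
    # surfacing misconfiguration at startup rather than deep inside a request.
    bucket: str = os.environ['S3_BUCKET_NAME']

The `-> str | None` annotation added in nomic_service.py follows the same reasoning in reverse: the declared return type now admits the code paths that presumably return no value.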