From 9460933ef5239d2b49966d133af3ad7b5b161f22 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Wed, 6 Mar 2024 07:49:08 -0800
Subject: [PATCH 1/3] Add detailed posthog logging for /ingest failures

---
 ai_ta_backend/beam/ingest.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py
index 09ebdb2d..81e6fab9 100644
--- a/ai_ta_backend/beam/ingest.py
+++ b/ai_ta_backend/beam/ingest.py
@@ -260,10 +260,34 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
       success_status['failure_ingest'].append(
           f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}"
       )
+      self.posthog.capture(
+          'distinct_id_of_the_user',
+          event='Ingest Failure',
+          properties={
+              'course_name':
+                  course_name,
+              's3_path':
+                  s3_paths,
+              'kwargs':
+                  kwargs,
+              'error':
+                  f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}"
+          })
       return success_status
 
   except Exception as e:
-    success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}")
+    err = f"❌❌ Error in /ingest: `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc()
+
+    success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {err}")
+    self.posthog.capture('distinct_id_of_the_user',
+                         event='Ingest Failure',
+                         properties={
+                             'course_name': course_name,
+                             's3_path': s3_paths,
+                             'kwargs': kwargs,
+                             'error': err
+                         })
+    sentry_sdk.capture_exception(e)
     print(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}")
     return success_status
 

From 44e9c1197b8801c328de73f1aa428ecd7d841d3d Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Wed, 6 Mar 2024 07:53:55 -0800
Subject: [PATCH 2/3] Add nomic logging and deleting back to /ingest

---
 ai_ta_backend/beam/ingest.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py
index 81e6fab9..dcc96d25 100644
--- a/ai_ta_backend/beam/ingest.py
+++ b/ai_ta_backend/beam/ingest.py
@@ -46,6 +46,11 @@
 from qdrant_client import QdrantClient, models
 from qdrant_client.models import PointStruct
 
+from ai_ta_backend.beam.nomic_logging import (
+    delete_from_document_map,
+    log_to_document_map,
+)
+
 # from langchain.schema.output_parser import StrOutputParser
 # from langchain.chat_models import AzureChatOpenAI
 
@@ -957,10 +962,8 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
 
     # add to Nomic document map
     if len(response.data) > 0:
-      pass
-      # TODO: reimplement nomic
-      # inserted_data = response.data[0]
-      # res = log_to_document_map(inserted_data)
+      inserted_data = response.data[0]
+      log_to_document_map(inserted_data)
 
     self.posthog.capture('distinct_id_of_the_user',
                          event='split_and_upload_succeeded',
@@ -1092,7 +1095,7 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
             nomic_ids_to_delete.append(str(data['id']) + "_" + str(i))
 
           # delete from Nomic
-          # res = delete_from_document_map(course_name, nomic_ids_to_delete)
+          delete_from_document_map(course_name, nomic_ids_to_delete)
         except Exception as e:
           print("Error in deleting file from Nomic:", e)
           sentry_sdk.capture_exception(e)
@@ -1137,8 +1140,7 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
             nomic_ids_to_delete.append(str(data['id']) + "_" + str(i))
 
           # delete from Nomic
-          # TODO: reimplement...
-          # res = delete_from_document_map(course_name, nomic_ids_to_delete)
+          delete_from_document_map(course_name, nomic_ids_to_delete)
         except Exception as e:
           print("Error in deleting file from Nomic:", e)
           sentry_sdk.capture_exception(e)

From 3281c7063297208647ce6fdef637cb665331fc00 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Wed, 6 Mar 2024 07:54:56 -0800
Subject: [PATCH 3/3] Add nomic dependency

---
 ai_ta_backend/beam/ingest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py
index dcc96d25..0aaf7d58 100644
--- a/ai_ta_backend/beam/ingest.py
+++ b/ai_ta_backend/beam/ingest.py
@@ -77,6 +77,7 @@
     "GitPython==3.1.40",
     "beautifulsoup4==4.12.2",
     "sentry-sdk==1.39.1",
+    "nomic==2.0.14",
 ]
 
 # TODO: consider adding workers. They share CPU and memory https://docs.beam.cloud/deployment/autoscaling#worker-use-cases
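
Note on the failure-logging pattern added in PATCH 1/3: the assignment `err = f"...", traceback.format_exc()` binds a tuple (because of the trailing comma), so both the PostHog `error` property and the `success_status` message carry Python's tuple repr rather than one flat string. Below is a minimal, self-contained sketch of the intended pattern. The standalone `report_ingest_failure` helper, the module-level client, and the placeholder API key are illustrative assumptions, not part of the patch; only the `posthog.capture`, `sentry_sdk.capture_exception`, `inspect`, and `traceback` usage mirrors the diff.

# Sketch only (not part of the patches): intended shape of the /ingest failure logging.
# Assumes a configured PostHog client and sentry_sdk.init() elsewhere; the helper name
# and the API key are hypothetical placeholders.
import inspect
import traceback

import sentry_sdk
from posthog import Posthog

posthog = Posthog(project_api_key='phc_placeholder', host='https://app.posthog.com')


def report_ingest_failure(e: Exception, course_name: str, s3_paths, **kwargs) -> str:
  # Call this from inside an `except Exception as e:` block so format_exc() sees the
  # active traceback. Build one flat string; the patch's trailing comma would make
  # `err` a tuple instead.
  err = (f"❌❌ Error in /ingest: `{inspect.currentframe().f_code.co_name}`: {e}\n"
         f"Traceback:\n{traceback.format_exc()}")
  posthog.capture('distinct_id_of_the_user',
                  event='Ingest Failure',
                  properties={
                      'course_name': course_name,
                      's3_path': s3_paths,
                      'kwargs': kwargs,
                      'error': err,
                  })
  sentry_sdk.capture_exception(e)
  return err

Used from the except block, this keeps the `success_status['failure_ingest']` entry human-readable while still shipping the full traceback to PostHog and Sentry.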
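
PATCH 2/3 re-imports `log_to_document_map` and `delete_from_document_map` from `ai_ta_backend.beam.nomic_logging`, which is not shown in this series. For orientation only, here is a rough sketch of the shape those helpers could take against nomic 2.x's `AtlasProject` API; the map-naming scheme, the `contexts` field, and the exact nomic call signatures are assumptions and may not match the real module.

# Hypothetical sketch of ai_ta_backend/beam/nomic_logging.py (the real module is not in this diff).
# Assumes nomic==2.0.14's AtlasProject; names, fields, and map naming are illustrative.
from typing import Any, Dict, List

from nomic import AtlasProject


def delete_from_document_map(course_name: str, ids: List[str]) -> None:
  # IDs follow the "<supabase row id>_<chunk index>" pattern built in delete_data().
  project = AtlasProject(name=f"Document Map for {course_name}", add_datums_if_exists=True)
  project.delete_data(ids=ids)
  project.rebuild_maps()


def log_to_document_map(inserted_data: Dict[str, Any]) -> None:
  # One datum per text chunk of the newly inserted Supabase row.
  course_name = inserted_data['course_name']
  datums = [{
      'id': f"{inserted_data['id']}_{i}",
      'text': context['text'],
      'course_name': course_name,
  } for i, context in enumerate(inserted_data['contexts'])]
  project = AtlasProject(name=f"Document Map for {course_name}", add_datums_if_exists=True)
  project.add_text(data=datums)
  project.rebuild_maps()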