Skip to content

Commit

Permalink
Add Posthog logs of successful doc ingests. Improve sentry error logs
Browse files · Browse the repository at this point in the history
  • Loading branch information
KastanDay committed Oct 17, 2024
1 parent c3b78e1 commit 3086af3
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 36 deletions.
25 changes: 13 additions & 12 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
version: 1.20.1
version: 1.22.6
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
sources:
- id: trunk
ref: v1.4.3
ref: v1.6.4
uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
Expand All @@ -23,20 +23,21 @@ lint:
enabled:
# - [email protected]
# - [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- actionlint@1.6.26
- [email protected].7
- [email protected].22
- actionlint@1.7.3
- [email protected].10
- [email protected].267
- git-diff-check
- markdownlint@0.39.0
- oxipng@9.0.0
- prettier@3.2.5
- ruff@0.2.2
- shellcheck@0.9.0
- markdownlint@0.42.0
- oxipng@9.1.2
- prettier@3.3.3
- ruff@0.6.9
- shellcheck@0.10.0
- [email protected]
- trivy@0.49.1
- trivy@0.56.2
- [email protected]
ignore:
- linters: [ALL]
Expand Down
69 changes: 45 additions & 24 deletions ai_ta_backend/beam/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@
from qdrant_client.models import PointStruct
from supabase.client import ClientOptions

# Initialize Sentry error + performance monitoring at module import time
# (moved out of loader() by this commit so exceptions raised before/without
# the loader running are still captured).
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),  # read from the environment; an unset/empty DSN disables reporting
# Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring.
traces_sample_rate=1.0,
# Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
enable_tracing=True)

requirements = [
"openai<1.0",
"supabase==2.5.3",
Expand Down Expand Up @@ -156,14 +165,6 @@ def loader():
# openai_api_type=OPENAI_API_TYPE)

posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com')
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),
# Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring.
traces_sample_rate=1.0,
# Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
enable_tracing=True)

return qdrant_client, vectorstore, s3_client, supabase_client, posthog

Expand Down Expand Up @@ -256,6 +257,21 @@ def run_ingest(course_name, s3_paths, base_url, url, readable_filename, content,
# rebuild_status = rebuild_map(str(course_name), map_type='document')
pass

# Success ingest!
posthog.capture(
'distinct_id_of_the_user',
event='ingest_success',
properties={
'course_name': course_name,
's3_path': s3_paths,
's3_paths': s3_paths,
'url': url,
'base_url': base_url,
'readable_filename': readable_filename,
'content': content,
'doc_groups': doc_groups,
# TODO: Tokens in entire document
})
print(f"Final success_fail_dict: {success_fail_dict}")
sentry_sdk.flush(timeout=20)
return json.dumps(success_fail_dict)
Expand Down Expand Up @@ -353,6 +369,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
success_status['success_ingest'] = s3_path
print(f"No ingest methods -- Falling back to UTF-8 INGEST... s3_path = {s3_path}")
except Exception as e:
sentry_sdk.capture_exception(e)
print(
f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. UTF-8 ingest error: {e}"
)
Expand Down Expand Up @@ -380,6 +397,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
except Exception as e:
err = f"❌❌ Error in /ingest: `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc(
) # type: ignore
sentry_sdk.capture_exception(e)

success_status['failure_ingest'] = {'s3_path': s3_path, 'error': f"MAJOR ERROR DURING INGEST: {err}"}
self.posthog.capture('distinct_id_of_the_user',
Expand All @@ -395,7 +413,8 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
print(f"MAJOR ERROR IN /bulk_ingest: {str(e)}")
return success_status

def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, readable_filename: str, **kwargs):
def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, readable_filename: str,
**kwargs):
"""Crawlee integration
"""
self.posthog.capture('distinct_id_of_the_user',
Expand Down Expand Up @@ -1191,22 +1210,24 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], **
if groups:
# call the supabase function to add the document to the group
if contexts[0].metadata.get('url'):
data, count = self.supabase_client.rpc('add_document_to_group_url', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()
data, count = self.supabase_client.rpc(
'add_document_to_group_url', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()
else:
data, count = self.supabase_client.rpc('add_document_to_group', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()

data, count = self.supabase_client.rpc(
'add_document_to_group', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()

if len(data) == 0:
print("Error in adding to doc groups")
raise ValueError("Error in adding to doc groups")
Expand Down

0 comments on commit 3086af3

Please sign in to comment.