Skip to content

Commit

Permalink
Add Posthog logs of successful doc ingests. Improve sentry error logs
Browse files · Browse the repository at this point in the history
  • Loading branch information
KastanDay committed Oct 17, 2024
1 parent c3b78e1 commit 3086af3
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 36 deletions.
25 changes: 13 additions & 12 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
version: 1.20.1
version: 1.22.6
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
sources:
- id: trunk
ref: v1.4.3
ref: v1.6.4
uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
Expand All @@ -23,20 +23,21 @@ lint:
enabled:
# - [email protected]
# - [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- actionlint@1.6.26
- [email protected].7
- [email protected].22
- actionlint@1.7.3
- [email protected].10
- [email protected].267
- git-diff-check
- markdownlint@0.39.0
- oxipng@9.0.0
- prettier@3.2.5
- ruff@0.2.2
- shellcheck@0.9.0
- markdownlint@0.42.0
- oxipng@9.1.2
- prettier@3.3.3
- ruff@0.6.9
- shellcheck@0.10.0
- [email protected]
- trivy@0.49.1
- trivy@0.56.2
- [email protected]
ignore:
- linters: [ALL]
Expand Down
69 changes: 45 additions & 24 deletions ai_ta_backend/beam/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@
from qdrant_client.models import PointStruct
from supabase.client import ClientOptions

# Initialize Sentry error + performance monitoring at module import time
# (moved out of loader() by this commit so exceptions raised before/without
# the loader running are still captured).
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),  # read from the environment; an unset/empty DSN disables reporting
# Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring.
traces_sample_rate=1.0,
# Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
enable_tracing=True)

requirements = [
"openai<1.0",
"supabase==2.5.3",
Expand Down Expand Up @@ -156,14 +165,6 @@ def loader():
# openai_api_type=OPENAI_API_TYPE)

posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com')
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),
# Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring.
traces_sample_rate=1.0,
# Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
enable_tracing=True)

return qdrant_client, vectorstore, s3_client, supabase_client, posthog

Expand Down Expand Up @@ -256,6 +257,21 @@ def run_ingest(course_name, s3_paths, base_url, url, readable_filename, content,
# rebuild_status = rebuild_map(str(course_name), map_type='document')
pass

# Success ingest!
posthog.capture(
'distinct_id_of_the_user',
event='ingest_success',
properties={
'course_name': course_name,
's3_path': s3_paths,
's3_paths': s3_paths,
'url': url,
'base_url': base_url,
'readable_filename': readable_filename,
'content': content,
'doc_groups': doc_groups,
# TODO: Tokens in entire document
})
print(f"Final success_fail_dict: {success_fail_dict}")
sentry_sdk.flush(timeout=20)
return json.dumps(success_fail_dict)
Expand Down Expand Up @@ -353,6 +369,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
success_status['success_ingest'] = s3_path
print(f"No ingest methods -- Falling back to UTF-8 INGEST... s3_path = {s3_path}")
except Exception as e:
sentry_sdk.capture_exception(e)
print(
f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. UTF-8 ingest error: {e}"
)
Expand Down Expand Up @@ -380,6 +397,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
except Exception as e:
err = f"❌❌ Error in /ingest: `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc(
) # type: ignore
sentry_sdk.capture_exception(e)

success_status['failure_ingest'] = {'s3_path': s3_path, 'error': f"MAJOR ERROR DURING INGEST: {err}"}
self.posthog.capture('distinct_id_of_the_user',
Expand All @@ -395,7 +413,8 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs):
print(f"MAJOR ERROR IN /bulk_ingest: {str(e)}")
return success_status

def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, readable_filename: str, **kwargs):
def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, readable_filename: str,
**kwargs):
"""Crawlee integration
"""
self.posthog.capture('distinct_id_of_the_user',
Expand Down Expand Up @@ -1191,22 +1210,24 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], **
if groups:
# call the supabase function to add the document to the group
if contexts[0].metadata.get('url'):
data, count = self.supabase_client.rpc('add_document_to_group_url', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()
data, count = self.supabase_client.rpc(
'add_document_to_group_url', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()
else:
data, count = self.supabase_client.rpc('add_document_to_group', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()

data, count = self.supabase_client.rpc(
'add_document_to_group', {
"p_course_name": contexts[0].metadata.get('course_name'),
"p_s3_path": contexts[0].metadata.get('s3_path'),
"p_url": contexts[0].metadata.get('url'),
"p_readable_filename": contexts[0].metadata.get('readable_filename'),
"p_doc_groups": groups,
}).execute()

if len(data) == 0:
print("Error in adding to doc groups")
raise ValueError("Error in adding to doc groups")
Expand Down

0 comments on commit 3086af3

Please sign in to comment.