Skip to content

Commit

Permalink
Added Nomic back to the codebase; migrated to V2 API (#328)
Browse files Browse the repository at this point in the history
* updated convo map creation functions

* added endpoint for convo map updates

* added doc map creation in nomic service

* added doc map update function

* removed nomic logging from beam

* cleaned up code

* modified doc update function to handle large updates (cropwizard)

* updated requirements.txt

* updated comments

* modified the map update APIs to execute in background

* Minor cleanup; added retry to one update function

---------

Co-authored-by: Kastan Day <[email protected]>
  • Loading branch information
star-nox and KastanDay authored Dec 6, 2024
1 parent 9eb671d commit 8753a78
Show file tree
Hide file tree
Showing 6 changed files with 498 additions and 940 deletions.
42 changes: 1 addition & 41 deletions ai_ta_backend/beam/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant

#from nomic_logging import delete_from_document_map, log_to_document_map, rebuild_map
from OpenaiEmbeddings import OpenAIAPIProcessor
from PIL import Image
from posthog import Posthog
Expand Down Expand Up @@ -91,7 +90,6 @@
"GitPython==3.1.40",
"beautifulsoup4==4.12.2",
"sentry-sdk==1.39.1",
"nomic==2.0.14",
"pdfplumber==0.11.0", # PDF OCR, better performance than Fitz/PyMuPDF in my Gies PDF testing.
]

Expand Down Expand Up @@ -253,8 +251,6 @@ def run_ingest(course_name, s3_paths, base_url, url, readable_filename, content,
# response = supabase_client.table('documents_failed').insert(document).execute() # type: ignore
# print(f"Supabase ingest failure response: {response}")
else:
# Success case: rebuild nomic document map after all ingests are done
# rebuild_status = rebuild_map(str(course_name), map_type='document')
pass

# Success ingest!
Expand Down Expand Up @@ -1198,11 +1194,6 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], **
response = self.supabase_client.table(
os.getenv('REFACTORED_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore

# add to Nomic document map
# if len(response.data) > 0:
# course_name = contexts[0].metadata.get('course_name')
# log_to_document_map(course_name)

# need to update Supabase tables with doc group info
if len(response.data) > 0:
# get groups from kwargs
Expand Down Expand Up @@ -1386,22 +1377,6 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
else:
print("Error in deleting file from Qdrant:", e)
sentry_sdk.capture_exception(e)
# try:
# # delete from Nomic
# response = self.supabase_client.from_(
# os.environ['REFACTORED_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq('s3_path', s3_path).eq(
# 'course_name', course_name).execute()
# data = response.data[0] #single record fetched
# nomic_ids_to_delete = []
# context_count = len(data['contexts'])
# for i in range(1, context_count + 1):
# nomic_ids_to_delete.append(str(data['id']) + "_" + str(i))

# # delete from Nomic
# delete_from_document_map(course_name, nomic_ids_to_delete)
# except Exception as e:
# print("Error in deleting file from Nomic:", e)
# sentry_sdk.capture_exception(e)

try:
self.supabase_client.from_(os.environ['REFACTORED_MATERIALS_SUPABASE_TABLE']).delete().eq(
Expand Down Expand Up @@ -1431,22 +1406,7 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
else:
print("Error in deleting file from Qdrant:", e)
sentry_sdk.capture_exception(e)
# try:
# # delete from Nomic
# response = self.supabase_client.from_(os.environ['REFACTORED_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq(
# 'url', source_url).eq('course_name', course_name).execute()
# data = response.data[0] #single record fetched
# nomic_ids_to_delete = []
# context_count = len(data['contexts'])
# for i in range(1, context_count + 1):
# nomic_ids_to_delete.append(str(data['id']) + "_" + str(i))

# # delete from Nomic
# delete_from_document_map(course_name, nomic_ids_to_delete)
# except Exception as e:
# print("Error in deleting file from Nomic:", e)
# sentry_sdk.capture_exception(e)


try:
# delete from Supabase
self.supabase_client.from_(os.environ['REFACTORED_MATERIALS_SUPABASE_TABLE']).delete().eq(
Expand Down
Loading

0 comments on commit 8753a78

Please sign in to comment.