From 8451a6bb01bdc1ddb91532671bc0ff742351d6c6 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 14:09:16 -0800 Subject: [PATCH 1/9] fix posthog error logs --- ai_ta_backend/beam/ingest.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 0aaf7d58..b8a44e00 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -4,6 +4,7 @@ """ import asyncio import inspect +import json import logging import mimetypes import os @@ -39,6 +40,7 @@ from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Qdrant +from nomic_logging import delete_from_document_map, log_to_document_map from OpenaiEmbeddings import OpenAIAPIProcessor from PIL import Image from posthog import Posthog @@ -46,11 +48,6 @@ from qdrant_client import QdrantClient, models from qdrant_client.models import PointStruct -from ai_ta_backend.beam.nomic_logging import ( - delete_from_document_map, - log_to_document_map, -) - # from langchain.schema.output_parser import StrOutputParser # from langchain.chat_models import AzureChatOpenAI @@ -148,7 +145,14 @@ def loader(): # Triggers determine how your app is deployed -@app.rest_api(workers=2, max_pending_tasks=15_000, max_retries=3, timeout=-1, loader=loader, autoscaler=autoscaler) +@app.rest_api( + workers=2, + callback_url='https://uiuc-chat-git-refactoringesttobeamserverless-kastanday.vercel.app/api/UIUC-api/ingestCallback', + max_pending_tasks=15_000, + max_retries=3, + timeout=-1, + loader=loader, + autoscaler=autoscaler) def ingest(**inputs: Dict[str, Any]): qdrant_client, vectorstore, s3_client, supabase_client, posthog = inputs["context"] @@ -157,8 +161,7 @@ def ingest(**inputs: Dict[str, Any]): url: List[str] | str | None = inputs.get('url', None) base_url: List[str] | str | None = inputs.get('base_url', None) readable_filename: List[str] | str = inputs.get('readable_filename', '') - content: str | None = inputs.get('content', None) # is webtext - # is_webtext: bool | None = inputs.get('url', False) + content: str | None = inputs.get('content', None) # is webtext if content exists print( f"In top of /ingest route. 
course: {course_name}, s3paths: {s3_paths}, readable_filename: {readable_filename}, base_url: {base_url}, url: {url}, content: {content}" @@ -177,7 +180,7 @@ def ingest(**inputs: Dict[str, Any]): base_url=base_url, url=url) print("Final success_fail_dict: ", success_fail_dict) - return success_fail_dict + return json.dumps(success_fail_dict) class Ingest(): @@ -268,7 +271,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): ) self.posthog.capture( 'distinct_id_of_the_user', - event='Ingest Failure', + event='ingest_failure', properties={ 'course_name': course_name, @@ -286,7 +289,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {err}") self.posthog.capture('distinct_id_of_the_user', - event='Ingest Failure', + event='ingest_failure', properties={ 'course_name': course_name, 's3_path': s3_paths, From f6615b1ba9e308d2137c25d33b974388d680b7e9 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 15:27:30 -0800 Subject: [PATCH 2/9] Increase workers to 3, remove callback url for now --- ai_ta_backend/beam/ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index b8a44e00..e7dd4a92 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -146,8 +146,8 @@ def loader(): # Triggers determine how your app is deployed @app.rest_api( - workers=2, - callback_url='https://uiuc-chat-git-refactoringesttobeamserverless-kastanday.vercel.app/api/UIUC-api/ingestCallback', + workers=3, + # callback_url='https://uiuc-chat-git-refactoringesttobeamserverless-kastanday.vercel.app/api/UIUC-api/ingestCallback', max_pending_tasks=15_000, max_retries=3, timeout=-1, From db076a7fb48de472cae3c0087905c3e4e5e2a74d Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 15:28:12 -0800 Subject: [PATCH 3/9] Add nomic logging to Beam ingest --- ai_ta_backend/beam/nomic_logging.py | 743 ++++++++++++++++++++++++++++ 1 file changed, 743 insertions(+) create mode 100644 ai_ta_backend/beam/nomic_logging.py diff --git a/ai_ta_backend/beam/nomic_logging.py b/ai_ta_backend/beam/nomic_logging.py new file mode 100644 index 00000000..18591a05 --- /dev/null +++ b/ai_ta_backend/beam/nomic_logging.py @@ -0,0 +1,743 @@ +import datetime +import json +import os +import time + +import backoff +import nomic +import numpy as np +import pandas as pd +import sentry_sdk +import supabase +from langchain.embeddings import OpenAIEmbeddings +from nomic import AtlasProject, atlas + +OPENAI_API_TYPE = "azure" + +SUPABASE_CLIENT = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + +LOCK_EXCEPTIONS = [ + 'Project is locked for state access! Please wait until the project is unlocked to access embeddings.', + 'Project is locked for state access! Please wait until the project is unlocked to access data.', + 'Project is currently indexing and cannot ingest new datums. Try again later.' 
+] + + +def giveup_hdlr(e): + """ + Function to handle giveup conditions in backoff decorator + Args: + e: Exception raised by the decorated function + Returns: + True if we want to stop retrying, False otherwise + """ + (e_args,) = e.args + e_str = e_args['exception'] + + print("giveup_hdlr() called with exception:", e_str) + if e_str in LOCK_EXCEPTIONS: + return False + else: + sentry_sdk.capture_exception(e) + return True + + +def backoff_hdlr(details): + """ + Function to handle backup conditions in backoff decorator. + Currently just prints the details of the backoff. + """ + print( + "\nBacking off {wait:0.1f} seconds after {tries} tries, calling function {target} with args {args} and kwargs {kwargs}" + .format(**details)) + + +def backoff_strategy(): + """ + Function to define retry strategy. Is usualy defined in the decorator, + but passing parameters to it is giving errors. + """ + return backoff.expo(base=10, factor=1.5) + + +@backoff.on_exception(backoff_strategy, + Exception, + max_tries=5, + raise_on_giveup=False, + giveup=giveup_hdlr, + on_backoff=backoff_hdlr) +def log_convo_to_nomic(course_name: str, conversation) -> str: + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + """ + Logs conversation to Nomic. + 1. Check if map exists for given course + 2. Check if conversation ID exists + - if yes, delete and add new data point + - if no, add new data point + 3. Keep current logic for map doesn't exist - update metadata + """ + + print(f"in log_convo_to_nomic() for course: {course_name}") + print("type of conversation:", type(conversation)) + #conversation = json.loads(conversation) + messages = conversation['conversation']['messages'] + if 'user_email' not in conversation['conversation']: + user_email = "NULL" + else: + user_email = conversation['conversation']['user_email'] + conversation_id = conversation['conversation']['id'] + + # we have to upload whole conversations + # check what the fetched data looks like - pandas df or pyarrow table + # check if conversation ID exists in Nomic, if yes fetch all data from it and delete it. 
+ # will have current QA and historical QA from Nomic, append new data and add_embeddings() + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + emoji = "" + + try: + # fetch project metadata and embbeddings + project = AtlasProject(name=project_name, add_datums_if_exists=True) + + map_metadata_df = project.maps[1].data.df # type: ignore + map_embeddings_df = project.maps[1].embeddings.latent + # create a function which returns project, data and embeddings df here + map_metadata_df['id'] = map_metadata_df['id'].astype(int) + last_id = map_metadata_df['id'].max() + + if conversation_id in map_metadata_df.values: + # store that convo metadata locally + prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] + prev_index = prev_data.index.values[0] + embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) + prev_convo = prev_data['conversation'].values[0] + prev_id = prev_data['id'].values[0] + created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') + + # delete that convo data point from Nomic, and print result + print("Deleting point from nomic:", project.delete_data([str(prev_id)])) + + # prep for new point + first_message = prev_convo.split("\n")[1].split(": ")[1] + + # select the last 2 messages and append new convo to prev convo + messages_to_be_logged = messages[-2:] + for message in messages_to_be_logged: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # update metadata + metadata = [{ + "course": course_name, + "conversation": prev_convo, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + }] + else: + print("conversation_id does not exist") + + # add new data point + user_queries = [] + conversation_string = "" + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + user_queries.append(first_message) + + for message in messages: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata = [{ + "course": course_name, + "conversation": conversation_string, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": current_time, + "modified_at": current_time + }] + + # create embeddings + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = embeddings_model.embed_documents(user_queries) + + # add embeddings to the project - create a new function for this + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) + project.rebuild_maps() + + print(f"⏰ Nomic logging runtime: {(time.monotonic() - 
start_time):.2f} seconds") + return f"Successfully logged for {course_name}" + + except Exception as e: + if str(e) == 'You must specify a unique_id_field when creating a new project.': + print("Attempting to create Nomic map...") + result = create_nomic_map(course_name, conversation) + print("result of create_nomic_map():", result) + else: + # raising exception again to trigger backoff and passing parameters to use in create_nomic_map() + raise Exception({"exception": str(e)}) + + +def get_nomic_map(course_name: str, type: str): + """ + Returns the variables necessary to construct an iframe of the Nomic map given a course name. + We just need the ID and URL. + Example values: + map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + """ + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + if type.lower() == 'document': + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + else: + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + + try: + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + map = project.get_map(project_name) + + print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds") + return {"map_id": f"iframe{map.id}", "map_link": map.map_link} + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. + if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print("Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ", + e) + else: + print("ERROR in get_nomic_map():", e) + sentry_sdk.capture_exception(e) + return {"map_id": None, "map_link": None} + + +def create_nomic_map(course_name: str, log_data: list): + """ + Creates a Nomic map for new courses and those which previously had < 20 queries. + 1. fetches supabase conversations for course + 2. appends current embeddings and metadata to it + 2. 
creates map if there are at least 20 queries + """ + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + print(f"in create_nomic_map() for {course_name}") + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) + response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() + data = response.data + df = pd.DataFrame(data) + + if len(data) < 19: + return None + else: + # get all queries for course and create metadata + user_queries = [] + metadata = [] + i = 1 + conversation_exists = False + + # current log details + log_messages = log_data['conversation']['messages'] # type: ignore + log_user_email = log_data['conversation']['user_email'] # type: ignore + log_conversation_id = log_data['conversation']['id'] # type: ignore + + for _index, row in df.iterrows(): + user_email = row['user_email'] + created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') + convo = row['convo'] + messages = convo['messages'] + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + + user_queries.append(first_message) + + # create metadata for multi-turn conversation + conversation = "" + for message in messages: + # string of role: content, role: content, ... + if message['role'] == 'user': # type: ignore + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # append current chat to previous chat if convo already exists + if convo['id'] == log_conversation_id: + conversation_exists = True + + for m in log_messages: + if m['role'] == 'user': # type: ignore + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(m['content'], list): + text = m['content'][0]['text'] + else: + text = m['content'] + conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" + + # adding modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # add to metadata + metadata_row = { + "course": row['course_name'], + "conversation": conversation, + "conversation_id": convo['id'], + "id": i, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + } + metadata.append(metadata_row) + i += 1 + + # add current log as a new data point if convo doesn't exist + if not conversation_exists: + user_queries.append(log_messages[0]['content']) + conversation = "" + for message in log_messages: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # adding timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata_row = { + "course": course_name, + "conversation": conversation, + "conversation_id": log_conversation_id, + "id": i, + "user_email": log_user_email, + "first_query": log_messages[0]['content'], + "created_at": current_time, + 
"modified_at": current_time + } + metadata.append(metadata_row) + + metadata = pd.DataFrame(metadata) + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = embeddings_model.embed_documents(user_queries) + + # create Atlas project + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_convo_index" + project = atlas.map_embeddings( + embeddings=np.array(embeddings), + data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete + id_field='id', + build_topic_model=True, + topic_label_field='first_query', + name=project_name, + colorable_fields=['conversation_id', 'first_query']) + project.create_index(index_name, build_topic_model=True) + return f"Successfully created Nomic map for {course_name}" + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. + if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e) + else: + print("ERROR in create_nomic_map():", e) + sentry_sdk.capture_exception(e) + + return "failed" + + +## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ## + + +def create_document_map(course_name: str): + """ + This is a function which creates a document map for a given course from scratch + 1. Gets count of documents for the course + 2. If less than 20, returns a message that a map cannot be created + 3. If greater than 20, iteratively fetches documents in batches of 25 + 4. Prepares metadata and embeddings for nomic upload + 5. Creates a new map and uploads the data + + Args: + course_name: str + Returns: + str: success or failed + """ + print("in create_document_map()") + nomic.login(os.getenv('NOMIC_API_KEY')) + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # check if map exists + response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + return "Map already exists for this course." + + # fetch relevant document data from Supabase + response = supabase_client.table("documents").select("id", + count="exact").eq("course_name", + course_name).order('id', + desc=False).execute() + if not response.count: + return "No documents found for this course." 
+ + total_doc_count = response.count + print("Total number of documents in Supabase: ", total_doc_count) + + # minimum 20 docs needed to create map + if total_doc_count > 19: + + first_id = response.data[0]['id'] + combined_dfs = [] + curr_total_doc_count = 0 + doc_count = 0 + first_batch = True + + # iteratively query in batches of 25 + while curr_total_doc_count < total_doc_count: + + response = supabase_client.table("documents").select( + "id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(25).execute() + df = pd.DataFrame(response.data) + combined_dfs.append(df) # list of dfs + + curr_total_doc_count += len(response.data) + doc_count += len(response.data) + + if doc_count >= 1000: # upload to Nomic every 1000 docs + + # concat all dfs from the combined_dfs list + final_df = pd.concat(combined_dfs, ignore_index=True) + + # prep data for nomic upload + embeddings, metadata = data_prep_for_doc_map(final_df) + + if first_batch: + # create a new map + print("Creating new map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) + # update flag + first_batch = False + + else: + # append to existing map + print("Appending data to existing map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + # add project lock logic here + result = append_to_map(embeddings, metadata, project_name) + + # reset variables + combined_dfs = [] + doc_count = 0 + + # set first_id for next iteration + first_id = response.data[-1]['id'] + 1 + + # upload last set of docs + final_df = pd.concat(combined_dfs, ignore_index=True) + embeddings, metadata = data_prep_for_doc_map(final_df) + project_name = NOMIC_MAP_NAME_PREFIX + course_name + if first_batch: + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) + else: + result = append_to_map(embeddings, metadata, project_name) + print("Atlas upload status: ", result) + + # log info to supabase + project = AtlasProject(name=project_name, add_datums_if_exists=True) + project_id = project.id + project.rebuild_maps() + project_info = {'course_name': course_name, 'doc_map_id': project_id} + response = supabase_client.table("projects").insert(project_info).execute() + print("Response from supabase: ", response) + return "success" + else: + return "Cannot create a map because there are less than 20 documents in the course." + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "failed" + + +def delete_from_document_map(course_name: str, ids: list): + """ + This function is used to delete datapoints from a document map. 
+ Currently used within the delete_data() function in vector_database.py + Args: + course_name: str + ids: list of str + """ + print("in delete_from_document_map()") + + try: + # check if project exists + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + project_id = response.data[0]['doc_map_id'] + else: + return "No document map found for this course" + + # fetch project from Nomic + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + + # delete the ids from Nomic + print("Deleting point from document map:", project.delete_data(ids)) + with project.wait_for_project_lock(): + project.rebuild_maps() + return "Successfully deleted from Nomic map" + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "Error in deleting from document map: {e}" + + +def log_to_document_map(data: dict): + """ + This is a function which appends new documents to an existing document map. It's called + at the end of split_and_upload() after inserting data to Supabase. + Args: + data: dict - the response data from Supabase insertion + """ + print("in add_to_document_map()") + + try: + # check if map exists + course_name = data['course_name'] + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + project_id = response.data[0]['doc_map_id'] + else: + # create a map + map_creation_result = create_document_map(course_name) + if map_creation_result != "success": + return "The project has less than 20 documents and a map cannot be created." + else: + # fetch project id + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + project_id = response.data[0]['doc_map_id'] + + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + #print("Inserted data: ", data) + + embeddings = [] + metadata = [] + context_count = 0 + # prep data for nomic upload + for row in data['contexts']: + context_count += 1 + embeddings.append(row['embedding']) + metadata.append({ + "id": str(data['id']) + "_" + str(context_count), + "doc_ingested_at": data['created_at'], + "s3_path": data['s3_path'], + "url": data['url'], + "readable_filename": data['readable_filename'], + "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "text": row['text'] + }) + embeddings = np.array(embeddings) + metadata = pd.DataFrame(metadata) + print("Shape of embeddings: ", embeddings.shape) + + # append to existing map + project_name = "Document Map for " + course_name + result = append_to_map(embeddings, metadata, project_name) + + # check if project is accepting new datums + if project.is_accepting_data: + with project.wait_for_project_lock(): + project.rebuild_maps() + + # with project.wait_for_project_lock(): + # project.rebuild_maps() + return result + + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "Error in appending to map: {e}" + + +def create_map(embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): + """ + Generic function to create a Nomic map from given parameters. 
+ Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + map_name: str + index_name: str + topic_label_field: str + colorable_fields: list of str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + + try: + project = atlas.map_embeddings(embeddings=embeddings, + data=metadata, + id_field="id", + build_topic_model=True, + name=map_name, + topic_label_field=topic_label_field, + colorable_fields=colorable_fields, + add_datums_if_exists=True) + project.create_index(index_name, build_topic_model=True) + return "success" + except Exception as e: + print(e) + return "Error in creating map: {e}" + + +def append_to_map(embeddings, metadata, map_name): + """ + Generic function to append new data to an existing Nomic map. + Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of Nomic upload metadata + map_name: str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + try: + project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=embeddings, data=metadata) + return "Successfully appended to Nomic map" + except Exception as e: + print(e) + return "Error in appending to map: {e}" + + +def data_prep_for_doc_map(df: pd.DataFrame): + """ + This function prepares embeddings and metadata for nomic upload in document map creation. + Args: + df: pd.DataFrame - the dataframe of documents from Supabase + Returns: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + """ + print("in data_prep_for_doc_map()") + + metadata = [] + embeddings = [] + texts = [] + + for index, row in df.iterrows(): + + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if row['url'] == None: + row['url'] = "" + # iterate through all contexts and create separate entries for each + context_count = 0 + for context in row['contexts']: + context_count += 1 + text_row = context['text'] + embeddings_row = context['embedding'] + + meta_row = { + "id": str(row['id']) + "_" + str(context_count), + "doc_ingested_at": row['created_at'], + "s3_path": row['s3_path'], + "url": row['url'], + "readable_filename": row['readable_filename'], + "created_at": current_time, + "text": text_row + } + + embeddings.append(embeddings_row) + metadata.append(meta_row) + texts.append(text_row) + + embeddings_np = np.array(embeddings, dtype=object) + print("Shape of embeddings: ", embeddings_np.shape) + + # check dimension if embeddings_np is (n, 1536) + if len(embeddings_np.shape) < 2: + print("Creating new embeddings...") + # embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, + # openai_api_base=os.getenv('AZURE_OPENAI_BASE'), + # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore + embeddings_model = OpenAIEmbeddings(openai_api_type="openai", + openai_api_base="https://api.openai.com/v1/", + openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore + embeddings = embeddings_model.embed_documents(texts) + + metadata = pd.DataFrame(metadata) + embeddings = np.array(embeddings) + + return embeddings, metadata + + +if __name__ == '__main__': + pass From 3abc7eb0053efba7b0d6d205b959d666180493d9 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Fri, 8 Mar 2024 10:51:00 -0800 Subject: [PATCH 4/9] Increase workers from 3 to 4 --- ai_ta_backend/beam/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index e7dd4a92..5dfca7fd 100644 --- a/ai_ta_backend/beam/ingest.py +++ 
b/ai_ta_backend/beam/ingest.py @@ -146,7 +146,7 @@ def loader(): # Triggers determine how your app is deployed @app.rest_api( - workers=3, + workers=4, # callback_url='https://uiuc-chat-git-refactoringesttobeamserverless-kastanday.vercel.app/api/UIUC-api/ingestCallback', max_pending_tasks=15_000, max_retries=3, From 107eef55b45432f70fda926994a7a158cd964067 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 12 Mar 2024 10:41:24 -0600 Subject: [PATCH 5/9] Fix mimetype parsing for ValueError: not enough values to unpack (expected 2, got 1) --- ai_ta_backend/beam/ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 5dfca7fd..a8ec7786 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -14,7 +14,7 @@ import uuid from pathlib import Path from tempfile import NamedTemporaryFile -from typing import Any, Callable, Dict, List, Union +from typing import Any, Callable, Dict, List, Optional, Union import beam import boto3 @@ -245,7 +245,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): with NamedTemporaryFile(suffix=file_extension) as tmpfile: self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) mime_type = str(mimetypes.guess_type(tmpfile.name, strict=False)[0]) - mime_category, mime_subcategory = mime_type.split('/') + mime_category = mime_type.split('/')[0] if '/' in mime_type else mime_type if file_extension in file_ingest_methods: # Use specialized functions when possible, fallback to mimetype. Else raise error. From 25655b4d8c29f1167a0951e433fde8f42738d999 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 12 Mar 2024 17:50:45 -0600 Subject: [PATCH 6/9] HOTFIX /delete. Separate Nomic from Supabase for better reliability if either fails --- ai_ta_backend/service/retrieval_service.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 3d73a82b..c9b6a7cc 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -315,7 +315,7 @@ def format_for_json_mqr(self, found_docs) -> List[Dict]: def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, identifier_value: str): try: - print(f"Deleting from Nomic and Supabase for {course_name} using {identifier_key}: {identifier_value}") + print(f"Nomic delete. 
Course: {course_name} using {identifier_key}: {identifier_value}") response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) if not response.data: raise Exception(f"No materials found for {course_name} using {identifier_key}: {identifier_value}") @@ -323,15 +323,17 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, nomic_ids_to_delete = [str(data['id']) + "_" + str(i) for i in range(1, len(data['contexts']) + 1)] # delete from Nomic - # check if project exists response = self.sqlDb.getProjectsMapForCourse(course_name) if not response.data: raise Exception(f"No document map found for this course: {course_name}") project_id = response.data[0]['doc_map_id'] self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) + except Exception as e: + print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) + self.sentry.capture_exception(e) - # delete from Supabase - print(f"Deleting from Supabase for {course_name} using {identifier_key}: {identifier_value}") + try: + print(f"Supabase Delete. course: {course_name} using {identifier_key}: {identifier_value}") response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) except Exception as e: print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) From 3b6a6eaa4f571809a95f9aff7ea2ae2caf029f73 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 12 Mar 2024 18:01:21 -0600 Subject: [PATCH 7/9] Trigger railway deploy --- ai_ta_backend/beam/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index a2f930a0..7eb323c6 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -261,7 +261,7 @@ def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): try: self._ingest_single_txt(s3_path, course_name) success_status['success_ingest'].append(s3_path) - print("✅ FALLBACK TO UTF-8 INGEST WAS SUCCESSFUL :) ") + print(f"No ingest methods -- Falling back to UTF-8 INGEST... s3_path = {s3_path}") except Exception as e: print( f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. UTF-8 ingest error: {e}" From 9d77ab204f2350faf6919a4db53e0fec44a2fdc4 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 12 Mar 2024 18:04:36 -0600 Subject: [PATCH 8/9] Trigger railway deploy (#232) --- ai_ta_backend/beam/ingest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 7eb323c6..72aae20c 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -751,13 +751,12 @@ def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str: Returns: str: "Success" or an error message """ - print("In text ingest") + print("In text ingest, UTF-8") try: # NOTE: slightly different method for .txt files, no need for download. 
It's part of the 'body'
       response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path)
-      print("s3 Resonse:", response)
       text = response['Body'].read().decode('utf-8')
-      print("Text from s3:", text)
+      print("UTF-8 text to ingest (from s3):", text)
       text = [text]
 
       metadatas: List[Dict[str, Any]] = [{

From c249036856776f437ca8224032c4fb945aee0879 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Wed, 13 Mar 2024 09:53:59 -0600
Subject: [PATCH 9/9] Hotfix prints in /delete

---
 ai_ta_backend/service/retrieval_service.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py
index c9b6a7cc..af425218 100644
--- a/ai_ta_backend/service/retrieval_service.py
+++ b/ai_ta_backend/service/retrieval_service.py
@@ -329,14 +329,14 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str,
         project_id = response.data[0]['doc_map_id']
         self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete)
     except Exception as e:
-      print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e)
+      print(f"Nomic Error in deleting. {identifier_key}: {identifier_value}", e)
       self.sentry.capture_exception(e)
 
     try:
       print(f"Supabase Delete. course: {course_name} using {identifier_key}: {identifier_value}")
       response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value)
     except Exception as e:
-      print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e)
+      print(f"Supabase Error in delete. {identifier_key}: {identifier_value}", e)
       self.sentry.capture_exception(e)
 
   def vector_search(self, search_query, course_name):
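
Context for PATCH 5/9: mimetypes.guess_type() returns (None, None) for unrecognized extensions, so str(mimetypes.guess_type(...)[0]) can be the literal string "None", which contains no '/' and made the old two-value unpack raise ValueError: not enough values to unpack (expected 2, got 1). Below is a minimal, standalone sketch of the guarded parsing; the guess_mime_category helper name is illustrative only and is not part of the patch.

import mimetypes


def guess_mime_category(filename: str) -> str:
  """Sketch of the PATCH 5/9 guard: never assume the guessed MIME type contains a '/'."""
  # guess_type() returns (None, None) for unknown extensions, so str(...) can be the string "None".
  mime_type = str(mimetypes.guess_type(filename, strict=False)[0])
  # Old code did: mime_category, mime_subcategory = mime_type.split('/')  -> ValueError when no '/'.
  return mime_type.split('/')[0] if '/' in mime_type else mime_type


if __name__ == '__main__':
  print(guess_mime_category("notes.pdf"))  # application
  print(guess_mime_category("archive.unknownext"))  # "None" (the string), instead of crashing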
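
Context for PATCH 6/9 and PATCH 9/9: the delete path now wraps the Nomic cleanup and the Supabase cleanup in separate try/except blocks, so a failure in one backend no longer aborts the other, and the new log prefixes ("Nomic Error in deleting.", "Supabase Error in delete.") make clear which half failed. The sketch below shows the same pattern in isolation; nomic_delete, supabase_delete, and report are stand-ins for the real injected services, not the project's actual API.

from typing import Callable


def delete_from_both_backends(course_name: str,
                              nomic_delete: Callable[[str], None],
                              supabase_delete: Callable[[str], None],
                              report: Callable[[Exception], None]) -> None:
  """Sketch of the decoupled delete: each backend gets its own try/except, so one failure cannot block the other."""
  try:
    nomic_delete(course_name)
  except Exception as e:
    # Same log-prefix style as PATCH 9/9, so the failing backend is obvious in the logs.
    print(f"Nomic Error in deleting. course: {course_name}", e)
    report(e)

  try:
    supabase_delete(course_name)
  except Exception as e:
    print(f"Supabase Error in delete. course: {course_name}", e)
    report(e)


if __name__ == '__main__':

  def failing_nomic(course: str) -> None:
    raise RuntimeError("Nomic is unreachable")

  # The Supabase half still runs even though the Nomic half raised.
  delete_from_both_backends("CS-101", failing_nomic, lambda c: print("supabase deleted:", c), print)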