Skip to content

Commit

Permalink
implemented pagination in supabase fetch query
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 20, 2023
1 parent 3156d34 commit 788c063
Showing 1 changed file with 113 additions and 56 deletions.
169 changes: 113 additions & 56 deletions ai_ta_backend/nomic_logging.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import datetime
import os
import time

import requests
import nomic
import numpy as np
import pandas as pd
import supabase
from langchain.embeddings import OpenAIEmbeddings
from nomic import AtlasProject, atlas
from nomic.project import AtlasClass


def log_convo_to_nomic(course_name: str, conversation) -> str:
Expand Down Expand Up @@ -296,6 +297,32 @@ def create_nomic_map(course_name: str, log_data: list):
project.create_index(index_name, build_topic_model=True)
return f"Successfully created Nomic map for {course_name}"

def list_projects(organization_id=None):
  """
  Lists all projects in an organization.

  If called without an organization id, it will list all projects in the
  current user's main organization.

  Args:
    organization_id: Optional Atlas organization id. When None, the current
      user's main organization is looked up and used instead.

  Returns:
    A list of dicts, one per project, each with the keys 'name', 'id' and
    'created_timestamp'.

  Raises:
    ValueError: if no organization id was provided and no main organization
      could be found for the current user.
    requests.HTTPError: if the Atlas API request fails.
  """
  client = AtlasClass()

  if organization_id is None:
    organization = client._get_current_users_main_organization()
    if organization is None:
      raise ValueError("No organization id provided and no main organization found.")
    organization_id = organization['organization_id']

  response = requests.get(
      client.atlas_api_path + f"/v1/organization/{organization_id}",
      headers=client.header,
  )
  # Fail loudly on HTTP errors instead of surfacing an opaque KeyError
  # from response.json()['projects'] below.
  response.raise_for_status()

  proj_info = response.json()['projects']
  return [{
      'name': project['project_name'],
      'id': project['id'],
      'created_timestamp': project['created_timestamp'],
  } for project in proj_info]

# NOTE(review): this region is a GitHub diff rendered without +/- markers, so
# removed (old) and added (new) lines appear interleaved and leading
# indentation has been lost. Comments below annotate intent only; code tokens
# are left untouched.
def create_map_for_all_courses():
"""
Creates a map for all the courses across UIUC.Chat
Expand All @@ -308,81 +335,111 @@ def create_map_for_all_courses():
# Supabase client built from environment credentials (the
# supabase.create_client(...) call itself is hidden by the collapsed hunk).
supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore

# get a list of all courses
courses = supabase_client.table("llm-convo-monitor").select("course_name").execute()
data = courses.data
df = pd.DataFrame(data)
course_list = df['course_name'].unique()
course_list = list(course_list)
course_list.remove(None) # there is one course called None, remove it

# Existing Nomic projects are listed once up front so each course can be
# checked for an existing map without an API call per course.
project_exists = False
existing_projects = list_projects()
for course in course_list:
print("Course name: ", course)
try:
# NOTE(review): the next four lines fetch every row for the course just to
# count it — this looks like the pre-pagination (old) side of the diff.
response = supabase_client.table("llm-convo-monitor").select("*").eq('course_name', course).execute()
data = response.data
course_df = pd.DataFrame(data)
print("Number of rows in the course: ", len(course_df))
# check if course already has a map
project_name = NOMIC_MAP_NAME_PREFIX + course

# Courses with fewer than 20 logged conversations are skipped.
if len(course_df) < 20:
continue
else:
# Linear scan of existing Nomic projects for a name match.
for project in existing_projects:
if "name" in project and project['name'] == project_name:
project_exists = True
break

# Reset the shared flag before skipping so the next course starts clean.
if project_exists:
project_exists = False
continue
else:
print("Creating map for course: ", course)

# fetch IDs of all conversations for this course
id_response = supabase_client.table("llm-convo-monitor").select("id").eq('course_name', course).order('id', desc=False).execute()
id_data = id_response.data
print("Total IDs: ", len(id_data))

if len(id_data) < 20: # cannot create a map with less than 20 queries
continue
else:
course_df = pd.DataFrame()
first_id = id_data[0]['id']
# NOTE(review): last_id is assigned but never read in the visible code.
last_id = id_data[-1]['id']

# Keyset pagination: pull 25 rows at a time, keyed on the last id seen,
# until as many rows as ids have been collected.
# NOTE(review): if a fetch ever returns zero rows this loop cannot
# terminate (and current_data[-1] would raise on the first iteration) —
# worth confirming upstream; the counter i is never used.
i = 0
current_data = []
while len(current_data) < len(id_data):
if len(current_data) == 0:
response = supabase_client.table("llm-convo-monitor").select("*").eq('course_name', course).gte('id', first_id).order('id', desc=False).limit(25).execute()
else:
response = supabase_client.table("llm-convo-monitor").select("*").eq('course_name', course).gt('id', first_id).order('id', desc=False).limit(25).execute()

current_data += response.data
first_id = current_data[-1]['id']
i += 1

course_df = pd.DataFrame(current_data)
print("Number of converstions: ", len(course_df))

# Build the per-conversation payload for Nomic: user_queries holds each
# conversation's first message (used for embeddings), metadata holds the
# display fields; i is a running 1-based id for the Atlas id_field.
user_queries = []
metadata = []
i = 1

# log conversation instead of individual messages
for index, row in course_df.iterrows():
# NOTE(review): the loop body below appears TWICE — this page is a diff
# without +/- markers, so the first copy is the removed (old) side and the
# second copy the added (new) side of the same change.
user_email = row['user_email']
created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S')
convo = row['convo']
messages = convo['messages']
first_message = messages[0]['content']

current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

user_queries.append(first_message)
# create metadata for multi-turn conversation
conversation = ""
for message in messages:
# string of role: content, role: content, ...
if message['role'] == 'user':
emoji = "🙋"
else:
emoji = "🤖"
conversation += "\n>>> " + emoji + message['role'] + ": " + message['content'] + "\n"
user_email = row['user_email']
created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S')
convo = row['convo']
messages = convo['messages']
first_message = messages[0]['content']
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
user_queries.append(first_message)
# create metadata for multi-turn conversation
conversation = ""
for message in messages:
# string of role: content, role: content, ...
if message['role'] == 'user':
emoji = "🙋"
else:
emoji = "🤖"
conversation += "\n>>> " + emoji + message['role'] + ": " + message['content'] + "\n"
# add to metadata
# NOTE(review): from here through the create_index call is the removed (old)
# side of the diff; the added replacement appears further below.
metadata_row = {"course": row['course_name'], "conversation": conversation, "conversation_id": convo['id'],
"id": i, "user_email": user_email, "first_query": first_message, "created_at": created_at,
"modified_at": current_time}
metadata.append(metadata_row)
i += 1

print("Length of user queries: ", len(user_queries))
print("Length of metadata: ", len(metadata))

metadata = pd.DataFrame(metadata)
embeddings_model = OpenAIEmbeddings()
embeddings = embeddings_model.embed_documents(user_queries)
embeddings = np.array(embeddings)
print(embeddings.shape)

# create an Atlas project
project_name = NOMIC_MAP_NAME_PREFIX + course
index_name = course + "_convo_index"
project = atlas.map_embeddings(embeddings=np.array(embeddings),
data=metadata,
id_field='id',
build_topic_model=True,
topic_label_field='first_query',
name=project_name,
colorable_fields=['conversation_id', 'first_query', 'created_at', 'modified_at'])
print(project.maps)

project.create_index(index_name, build_topic_model=True)
"id": i, "user_email": user_email, "first_query": first_message, "created_at": created_at, "modified_at": current_time}

metadata.append(metadata_row)
i += 1

# NOTE(review): this is the added (new) side of the diff for the embedding and
# Atlas-map-creation step; the removed (old) counterpart appears earlier on
# this page. Tokens are left untouched.
print("Length of user queries: ", len(user_queries))
print("Length of metadata: ", len(metadata))

metadata = pd.DataFrame(metadata)
embeddings_model = OpenAIEmbeddings() # type: ignore
embeddings = embeddings_model.embed_documents(user_queries)
embeddings = np.array(embeddings)
print(embeddings.shape)

# create an Atlas project
index_name = course + "_convo_index"
project = atlas.map_embeddings(embeddings=np.array(embeddings),
data=metadata,
id_field='id',
build_topic_model=True,
topic_label_field='first_query',
name=project_name,
colorable_fields=['conversation_id', 'first_query', 'created_at', 'modified_at'])
print("Project Maps: ", project.maps)
project.create_index(index_name, build_topic_model=True)
# Broad best-effort handler: one bad course is logged and the loop moves on.
# NOTE(review): the two print styles below look like the old vs. new sides of
# the diff for the same error message.
except Exception as e:
print("course_name:", course)
print("error: ", e)
print("Failed to create map due to error: ", e)

# Script entry point intentionally does nothing when run directly.
if __name__ == '__main__':
pass

0 comments on commit 788c063

Please sign in to comment.