Commit: Clean up env vars and minor type errors
KastanDay committed Mar 8, 2024
1 parent 484b2a2 commit 016e48b
Showing 8 changed files with 36 additions and 34 deletions.
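
The recurring change below swaps `os.getenv(...)` for `os.environ[...]` wherever a setting is required, and renames the `NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE` variable to `SUPABASE_DOCUMENTS_TABLE`. A minimal sketch of why the lookup style matters:

```python
import os

# os.getenv returns None (or a supplied default) when the variable is unset,
# so a missing setting can silently flow through the program.
table = os.getenv('SUPABASE_DOCUMENTS_TABLE')      # None if unset
table = os.getenv('SUPABASE_DOCUMENTS_TABLE', '')  # '' if unset

# os.environ[...] raises KeyError at the call site, failing fast on misconfiguration.
table = os.environ['SUPABASE_DOCUMENTS_TABLE']     # KeyError if unset
```
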
.env.template: 1 addition, 1 deletion

@@ -5,7 +5,7 @@ SUPABASE_READ_ONLY=
 SUPABASE_JWT_SECRET=
 
 MATERIALS_SUPABASE_TABLE=uiuc_chatbot
-NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents
+SUPABASE_DOCUMENTS_TABLE=documents
 
 # QDRANT
 QDRANT_COLLECTION_NAME=uiuc-chatbot
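
For context, a sketch of how a `.env` template like this is typically loaded at startup; python-dotenv is an assumption for illustration, since the loader itself is not part of this diff:

```python
import os

from dotenv import load_dotenv  # assumed loader, not shown in this commit

load_dotenv()  # copies entries from .env into the process environment
print(os.environ['SUPABASE_DOCUMENTS_TABLE'])  # -> 'documents'
```
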
ai_ta_backend/beam/ingest.py: 10 additions, 11 deletions

@@ -959,7 +959,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
     }
 
     response = self.supabase_client.table(
-        os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute()  # type: ignore
+        os.getenv('SUPABASE_DOCUMENTS_TABLE')).insert(document).execute()  # type: ignore
 
     # add to Nomic document map
     if len(response.data) > 0:
@@ -988,7 +988,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]):
     For given metadata, fetch docs from Supabase based on S3 path or URL.
     If docs exists, concatenate the texts and compare with current texts, if same, return True.
     """
-    doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '')
+    doc_table = os.getenv('SUPABASE_DOCUMENTS_TABLE', '')
     course_name = metadatas[0]['course_name']
     incoming_s3_path = metadatas[0]['s3_path']
     url = metadatas[0]['url']
@@ -1087,8 +1087,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
     try:
       # delete from Nomic
       response = self.supabase_client.from_(
-          os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(
-              's3_path', s3_path).eq('course_name', course_name).execute()
+          os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq('s3_path', s3_path).eq(
+              'course_name', course_name).execute()
       data = response.data[0]  #single record fetched
       nomic_ids_to_delete = []
       context_count = len(data['contexts'])
@@ -1102,8 +1102,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
       sentry_sdk.capture_exception(e)
 
     try:
-      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-          's3_path', s3_path).eq('course_name', course_name).execute()
+      self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq(
+          'course_name', course_name).execute()
     except Exception as e:
       print("Error in deleting file from supabase:", e)
       sentry_sdk.capture_exception(e)
@@ -1131,9 +1131,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
       sentry_sdk.capture_exception(e)
     try:
       # delete from Nomic
-      response = self.supabase_client.from_(
-          os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq(
-              'url', source_url).eq('course_name', course_name).execute()
+      response = self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, url, contexts").eq(
+          'url', source_url).eq('course_name', course_name).execute()
       data = response.data[0]  #single record fetched
       nomic_ids_to_delete = []
       context_count = len(data['contexts'])
@@ -1148,8 +1147,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str):
 
     try:
       # delete from Supabase
-      self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-          'url', source_url).eq('course_name', course_name).execute()
+      self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('url', source_url).eq(
+          'course_name', course_name).execute()
     except Exception as e:
       print("Error in deleting file from supabase:", e)
       sentry_sdk.capture_exception(e)
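
The `delete_data` hunks above repeat one pattern: each external store is handled in its own `try`/`except` that reports to Sentry and moves on, so a failure deleting from Nomic cannot block the Supabase deletion. A condensed sketch of that pattern; `delete_from_nomic` and `delete_from_supabase` are hypothetical stand-ins for the inline blocks:

```python
import sentry_sdk


def delete_from_nomic(course_name: str, s3_path: str):
  ...  # hypothetical stand-in for the Nomic block above


def delete_from_supabase(course_name: str, s3_path: str):
  ...  # hypothetical stand-in for the Supabase block above


def cleanup_document(course_name: str, s3_path: str):
  # Each deletion runs independently: an exception is printed and sent to
  # Sentry, and the loop continues with the remaining stores.
  for delete in (delete_from_nomic, delete_from_supabase):
    try:
      delete(course_name, s3_path)
    except Exception as e:
      print("Error in deleting file:", e)
      sentry_sdk.capture_exception(e)
```
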
ai_ta_backend/database/sql.py: 10 additions, 11 deletions

@@ -13,26 +13,25 @@ def __init__(self, db_url: str):
         supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
 
   def getAllMaterialsForCourse(self, course_name: str):
-    return self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select(
-        'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute()
+    return self.supabase_client.table(
+        os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
+            'course_name', course_name).execute()
 
   def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
-    return self.supabase_client.from_(
-        os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(
-            's3_path', s3_path).eq('course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
+        's3_path', s3_path).eq('course_name', course_name).execute()
 
   def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
-    return self.supabase_client.from_(
-        os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(key, value).eq(
-            'course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
+        key, value).eq('course_name', course_name).execute()
 
   def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
-    return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(key, value).eq(
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq(key, value).eq(
         'course_name', course_name).execute()
 
   def deleteMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
-    return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(
-        's3_path', s3_path).eq('course_name', course_name).execute()
+    return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq(
+        'course_name', course_name).execute()
 
   def getProjectsMapForCourse(self, course_name: str):
     return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute()
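
Every method above re-reads the table name from the environment. As an aside (not something this commit does), resolving it once in `__init__` keeps the lookups in one place and still fails fast; a sketch using the same `create_client` call visible in the hunk:

```python
import os

import supabase


class SQLDatabase:

  def __init__(self):
    # Resolve configuration once; raises KeyError at startup if anything is missing.
    self.documents_table = os.environ['SUPABASE_DOCUMENTS_TABLE']
    self.supabase_client = supabase.create_client(
        supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])

  def getAllMaterialsForCourse(self, course_name: str):
    return self.supabase_client.table(self.documents_table).select(
        'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute()
```
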
ai_ta_backend/service/export_service.py: 8 additions, 5 deletions

@@ -148,7 +148,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):

     #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
     s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
-    self.s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file)
+    self.s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
 
     # remove local files
     os.remove(file_path)
@@ -157,12 +157,15 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):
     print("file uploaded to s3: ", s3_file)
 
     # generate presigned URL
-    s3_url = self.s3.generatePresignedUrl('get_object', os.getenv('S3_BUCKET_NAME'), s3_path, 3600)
+    s3_url = self.s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
 
     # get admin email IDs
-    headers = {"Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", "Content-Type": "application/json"}
+    headers = {
+        "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
+        "Content-Type": "application/json"
+    }
 
-    hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name
+    hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
     response = requests.get(hget_url, headers=headers)
     course_metadata = response.json()
     course_metadata = json.loads(course_metadata['result'])
@@ -187,7 +190,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path):
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails)
+    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
     print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
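
`generatePresignedUrl` is a project wrapper; only its call site appears in this diff. A hedged sketch of what such a wrapper typically delegates to in boto3 (the mapping is an assumption). Note that the `3600` passed above is one hour, while the email body promises a 48-hour link, so the two values disagree:

```python
import boto3


def generate_presigned_url(client_method: str, bucket: str, key: str, expiration: int) -> str:
  # Build a time-limited URL for an existing S3 object.
  s3 = boto3.client('s3')
  return s3.generate_presigned_url(
      ClientMethod=client_method,             # e.g. 'get_object'
      Params={'Bucket': bucket, 'Key': key},
      ExpiresIn=expiration)                   # lifetime in seconds; 3600 = 1 hour
```
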
ai_ta_backend/service/nomic_service.py: 1 addition, 1 deletion

@@ -71,7 +71,7 @@ def __init__(self, sentry: SentryService, sql: SQLDatabase):
                         raise_on_giveup=False,
                         giveup=giveup_hdlr,
                         on_backoff=backoff_hdlr)
-  def log_convo_to_nomic(self, course_name: str, conversation) -> str:
+  def log_convo_to_nomic(self, course_name: str, conversation) -> str | None:
     # nomic.login(os.getenv('NOMIC_API_KEY'))  # login during start of flask app
     NOMIC_MAP_NAME_PREFIX = 'Conversation Map for '
     """
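
The widened annotation acknowledges that the decorated function can return `None`: the backoff decorator above is configured with `raise_on_giveup=False`, so a call that gives up returns `None` instead of raising. The `str | None` spelling is PEP 604 syntax, valid at runtime only on Python 3.10+; the equivalent spelling for older interpreters:

```python
from typing import Optional


# Equivalent to `-> str | None` on interpreters older than Python 3.10.
def log_convo_to_nomic_compat(course_name: str, conversation) -> Optional[str]:
  ...
```
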
ai_ta_backend/service/retrieval_service.py: 4 additions, 4 deletions

@@ -75,8 +75,8 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =

     pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
     # count tokens at start and end, then also count each context.
-    token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +
-                                             search_query)  # type: ignore
+    token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +  # type: ignore
+                                             search_query)
 
     valid_docs = []
     num_tokens = 0
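
The comment in this hunk ("count tokens at start and end, then also count each context") implies a budget loop just past the visible lines. A hedged sketch of that pattern; `found_docs` and the loop itself are assumptions, since only `token_counter`, `valid_docs`, and `num_tokens` appear in the diff:

```python
from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost


def fit_docs_to_token_limit(found_docs, token_counter: int, token_limit: int):
  # Assumed pattern: admit contexts until prompt tokens plus context tokens
  # would exceed the budget, then stop.
  valid_docs = []
  num_tokens = 0
  for doc in found_docs:
    doc_tokens, _ = count_tokens_and_cost(doc.page_content)
    if token_counter + num_tokens + doc_tokens > token_limit:
      break  # budget exhausted
    valid_docs.append(doc)
    num_tokens += doc_tokens
  return valid_docs
```
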
@@ -357,8 +357,8 @@ def vector_search(self, search_query, course_name):
     for d in search_results:
       try:
         metadata = d.payload
-        page_content = metadata["page_content"]
-        del metadata["page_content"]
+        page_content = metadata["page_content"]  # type: ignore
+        del metadata["page_content"]  # type: ignore
         if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys():  # type: ignore
           # aiding in the database migration...
           metadata["pagenumber"] = metadata["pagenumber_or_timestamp"]  # type: ignore
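
The new `# type: ignore` comments silence the checker because qdrant-client types a point's `payload` as `Optional`, so indexing into it is flagged. An explicit runtime guard is one alternative (a sketch, not what the commit does):

```python
from typing import Any, Dict, List


def extract_payloads(search_results) -> List[Dict[str, Any]]:
  docs = []
  for d in search_results:  # e.g. qdrant ScoredPoint results
    metadata = d.payload
    if metadata is None:
      continue  # narrow the Optional: skip points without a payload
    page_content = metadata.pop("page_content")  # read and remove in one step
    docs.append({"page_content": page_content, "metadata": metadata})
  return docs
```
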
ai_ta_backend/utils/context_parent_doc_padding.py: 1 addition, 1 deletion

@@ -6,7 +6,7 @@

 import supabase
 
-DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
+DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE']
 # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'],
 #                                          supabase_key=os.environ['SUPABASE_API_KEY'])  # type: ignore
 
ai_ta_backend/utils/utils_tokenization.py: 1 addition, 0 deletions

@@ -9,6 +9,7 @@ def count_tokens_and_cost(
     completion: str = '',
     openai_model_name: str = "gpt-3.5-turbo"):  # -> tuple[int, float] | tuple[int, float, int, float]:
   """
+  # TODO: improve w/ extra tokens used by model: https://github.com/openai/openai-cookbook/blob/d00e9a48a63739f5b038797594c81c8bb494fc09/examples/How_to_count_tokens_with_tiktoken.ipynb
   Returns the number of tokens in a text string.
   Only the first parameter is required, a string of text to measure. The completion and model name are optional.
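
The new TODO points at the OpenAI cookbook notebook on counting the extra tokens a chat model adds per message. For reference, a minimal sketch of the counting half of such a helper with tiktoken (the cost half of `count_tokens_and_cost` is omitted here):

```python
import tiktoken


def count_tokens(prompt: str, openai_model_name: str = "gpt-3.5-turbo") -> int:
  # Resolve the tokenizer for the model, then count the prompt's tokens.
  encoding = tiktoken.encoding_for_model(openai_model_name)
  return len(encoding.encode(prompt))
```
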
